1#!/usr/bin/perl
2#
3# htmldiff - present a diff marked version of two html documents
4#
5# Copyright (c) 1998-2006 MACS, Inc.
6#
7# Copyright (c) 2007 SiSco, Inc.
8#
9# SPDX-License-Identifier: MIT
10#
11# Permission is hereby granted, free of charge, to any person obtaining
12# a copy of this software and associated documentation files (the
13# "Software"), to deal in the Software without restriction, including
14# without limitation the rights to use, copy, modify, merge, publish,
15# distribute, sublicense, and/or sell copies of the Software, and to
16# permit persons to whom the Software is furnished to do so, subject to
17# the following conditions:
18#
19# The above copyright notice and this permission notice shall be
20# included in all copies or substantial portions of the Software.
21#
22# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
26# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
27# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
28# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29#
30# See http://www.themacs.com for more information.
31#
# usage: htmldiff [[-c] [-l] [-o] [-t] oldversion newversion [output]]
#
# -c - disable metahtml comment processing
# -o - disable outputting of old text
# -l - use navindex to create sequence of diffs
# -t - add a button (and its script) to show/hide the old content
37# oldversion - the previous version of the document
38# newversion - the newer version of the document
39# output - a filename to place the output in. If omitted, the output goes to
40#          standard output.
41#
42# if invoked with no options or arguments, operates as a CGI script. It then
43# takes the following parameters:
44#
45# oldfile - the URL of the original file
46# newfile - the URL of the new file
47# mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
48#
49# requires GNU diff utility
# also requires the perl module Getopt::Std (and, when run as a CGI script,
# the modules CGI and LWP::UserAgent)
51#
52# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
# DTD. This is because the algorithm is relatively simple, and there are
54# places in the markup content model where the span element is not allowed.
55# Htmldiff is NOT aware of these places.
56#
57# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $
58# $Revision: 1.3 $
59#
60# $Log: htmldiff.pl,v $
61# Revision 1.3  2016/10/24 15:06:51  dom
62# Summary: Use nav script always
63#
64# Revision 1.2  2016/10/24 15:04:28  dom
65# Add navigation script
66#
67# Revision 1.1  2014-01-06 08:04:51  dom
68# added copy of htmldiff perl script since aptest.com repo no longer available
69#
70# Revision 1.5  2008/03/05 13:23:16  ahby
71# Fixed a problem with leading whitespace before markup.
72#
73# Revision 1.4  2007/12/13 13:09:16  ahby
74# Updated copyright and license.
75#
76# Revision 1.3  2007/12/13 12:53:34  ahby
77# Changed use of span to ins and del
78#
79# Revision 1.2  2002/02/13 16:27:23  ahby
80# Changed processing model.
81# Improved handling of old text and changed styles.
82#
83# Revision 1.1  2000/07/12 12:20:04  ahby
84# Updated to remove empty spans - this fixes validation problems under
85# strict.
86#
87# Revision 1.11  1999/12/08 19:46:45  ahby
88# Fixed validation errors introduced by placing markup where it didn't
89# belong.
90#
91# Revision 1.10  1999/10/18 13:42:58  ahby
92# Added -o to the usage message.
93#
94# Revision 1.9  1999/05/04 12:29:11  ahby
95# Added an option to turn off the display of old text.
96#
97# Revision 1.8  1999/04/09 14:37:27  ahby
98# Fixed a perl syntax error.
99#
100# Revision 1.7  1999/04/09 14:35:49  ahby
101# Added reference to MACS homepage.
102#
103# Revision 1.6  1999/04/09 14:35:09  ahby
104# Added comment about validity of generated markup.
105#
106# Revision 1.5  1999/02/22 22:17:54  ahby
107# Changed to use stylesheets.
108# Changed to rely upon span.
109# Changed to work around content model problems.
110#
111# Revision 1.4  1999/02/08 02:32:22  ahby
112# Added a copyright statement.
113#
114# Revision 1.3  1999/02/08 02:30:40  ahby
115# Added header processing.
116#
117# Revision 1.2  1998/12/10 17:31:31  ahby
118# Fixed to escape less-thans in change blocks and to not permit change
119# markup within specific elements (like TITLE).
120#
121# Revision 1.1  1998/11/26 00:09:22  ahby
122# Initial revision
123#
124#
125
126use Getopt::Std;
127
# usage - print the command-line synopsis to STDERR and terminate.
#
# The synopsis lists every switch accepted by getopts("clto") in cli().
# The exit status is non-zero so that a calling script can tell a usage
# error apart from a successful run.
sub usage {
	print STDERR "htmldiff [-c] [-l] [-t] [-o] oldversion newversion [output]\n";
	exit 1;
}
132
# url_encode - percent-escape control and high-bit characters.
#
# url_encode(string)
#
# Every character in the ranges 0x00-0x1f and 0x7f-0xff is replaced by
# "%" followed by its two-digit lowercase hex code.  All other characters
# (including "%" and space) are returned untouched.
sub url_encode {
    my ($text) = @_;
    $text =~ s{([\x00-\x1f\x7F-\xFF])}{ sprintf('%%%02x', ord($1)) }eg;
    return $text;
}
139
# markit - diff-mark the streams
#
# markit(file1, file2)
#
# file1 - file holding the split-up old document (as produced by splitit)
# file2 - file holding the split-up new document
#
# Returns the merged document as one string, with the differing runs
# wrapped in <del>/<ins> elements.
#
# markit relies upon GNU diff to mark up the text.  diff is told, through
# its --*-group-format options, to bracket each group of differing lines
# with lines that hold a single control character:
#
#   a block wrapped in control-a (\001) is deleted/old text
#   a block wrapped in control-c (\003) is new text
#   a block wrapped in control-d (\004) is changed text; it directly
#       follows the control-a block holding the text it replaces
#
# control-b (\002) is accepted by the parser as a second "old" marker, but
# none of the formats built here emits it.
#
# Globals read: $opt_o (do not output old text) and $opt_l (wrap every
# marked segment in an <a> that links to the following difference).
#
# When the loop encounters a line of markup inside a block, it closes the
# current wrapper, outputs the markup, and then re-opens the wrapper.

sub markit {
	my ($file1, $file2) = @_;
	our ($opt_l, $opt_o);	# command-line switches, set by getopts() in cli()

	my $old = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
	my $new = "%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
	my $unchanged = "%=";
	my $changed = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
	if ($opt_o) {
		$old = "";
		$changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
	}

	# opening and closing tags, indexed by state number (index 0 is unused)
	my @span = (
		undef,
		"<del class=\"diff-old\">",
		"<del class=\"diff-old\">",
		"<ins class=\"diff-new\">",
		"<ins class=\"diff-chg\">",
	);
	my @diffEnd = (undef, '</del>', '</del>', '</ins>', '</ins>');

	# Run diff without going through a shell: every format and file name is
	# handed over as one argument, exactly as written, so nothing in them
	# can be re-interpreted as shell syntax.
	my @command = (
		'diff', '-d',
		"--old-group-format=$old",
		"--new-group-format=$new",
		"--changed-group-format=$changed",
		"--unchanged-group-format=$unchanged",
		$file1, $file2,
	);
	open(my $diff, '-|', @command) or die("Diff failed: $!");

	my $retval = "";
	my $diffcounter = 0;	# number of the next navigation anchor
	my $state = 0;			# 0 = resting, 1-4 = inside that kind of block
	my $temp = "";			# text accumulated for the current block
	my $blockAnchor = "";	# anchor that opens the current block's first segment

# strategy:
#
# process the output of diff...
#
# a line with control A-D means the start/end of the corresponding ordinal
# state (1-4). Resting state is state 0.
#
# While in a state, accumulate the contents for that state. When exiting the
# state, determine if it is appropriate to emit the contents with markup or
# not (basically, if the accumulated buffer contains only empty lines or lines
# with markup, then we don't want to emit the wrappers.  We don't need them.
#
# Note that if there is markup in the "old" block, that markup is silently
# removed.  It isn't really that interesting, and it messes up the output
# something fierce.

	while (my $line = <$diff>) {
		my $nextCounter = $diffcounter + 1;
		my $anchor = $opt_l ? qq[<a tabindex="$diffcounter" id="diff-$diffcounter" href="#diff-$nextCounter">] : "" ;
		my $anchorEnd = $opt_l ? q[</a>] : "" ;

		if ($state == 0) {	# if we are resting and we find a marker,
							# then we must be entering a block
			if ($line =~ m/^([\001-\004])/) {
				$state = ord($1);
				$line = "";
			}
		} else {
			# if we are in "old" state, remove markup
			if (($state == 1) || ($state == 2)) {
				$line =~ s/\<.*\>//;	# get rid of any old markup
				$line =~ s/\</&lt;/g;	# escape any remaining STAG or ETAGs
				$line =~ s/\>/&gt;/g;
			}
			if ($line =~ m/^[\001-\004]/) {
				# another marker: we are exiting the state
				if ($temp ne "") {
					$line = $span[$state] . $blockAnchor . $temp . $anchorEnd . $diffEnd[$state] . "\n";
					$temp = "";
				} else {
					$line = "";
				}
				$state = 0;
			} elsif ($line =~ m/^\s*\</) {	# otherwise, is this line markup?
				if ($temp eq "") {
					# markup AND nothing else seen yet: emit it unwrapped
					$retval .= $line;
					$line = "";
				} else {
					# close the running segment, hold the markup, and open
					# a new segment (with the next anchor number) after it
					$line =~ s/^/$anchorEnd$diffEnd[$state]/;
					$line =~ s/$/$span[$state]$anchor/;
					$temp .= $line;
					$line = "";
					$diffcounter++;
				}
			} elsif ($line =~ m/.+/) {
				if ($temp eq "") {
					# First text of the block: fix its anchor number now,
					# so that anchors are numbered in document order even
					# when markup later splits the block into segments.
					$blockAnchor = $anchor;
					$diffcounter++;
				}
				$temp .= $line;
				$line = "";
			}
		}

		$line =~ tr/\001-\004//d;	# drop any stray marker characters
		if ($line !~ m/^$/) {
			$retval .= $line;
		}
	}
	close($diff);
	# diff exits with 0 (no differences) or 1 (differences); report the rest
	if (($? & 127) || (($? >> 8) > 1)) {
		warn("diff did not finish cleanly (wait status $?)\n");
	}

	# First remove wrappers that contain nothing but newlines ...
	for my $kind (1 .. 4) {
		my $open = quotemeta($span[$kind]);
		my $close = quotemeta($diffEnd[$kind]);
		$retval =~ s/$open\n+$close//g;
	}
	# ... and only then a wrapper left open at the very end of the text.
	for my $kind (1 .. 4) {
		my $open = quotemeta($span[$kind]);
		$retval =~ s/$open\n*$//g;
	}
	return $retval;
}
296
# splitit - break an HTML file up into pieces that diff well
#
# splitit(filename, headertmp)
#
# filename  - the HTML file to read
# headertmp - file that receives the head of the document: every line up to
#             and including the one holding the <body> tag, with the diff
#             style sheet inserted before the line holding </head>.  It is
#             only written when the global $stripheader is true.
#
# Returns the rest of the document as a string in which HTML comments are
# removed, lines inside <pre> (and, when $mhtmlcomments is set, lines that
# start with ";;;") are kept as they are, and every other line is split on
# whitespace with each tag placed on a line of its own.
#
# Globals read: $stripheader, $mhtmlcomments, $opt_t (add the show/hide
# button and its script) and $opt_l (add the navigation note).
#
# Dies when either file cannot be opened.
sub splitit {
	my ($filename, $headertmp) = @_;
	our ($opt_l, $opt_t, $stripheader, $mhtmlcomments);

	my $preformatted = 0;
	my $inelement = 0;	# count of '<' not yet balanced by a '>'
	my $retval = "";
	my $styles = q(<style type='text/css'>
.diff-old-a {
  font-size: smaller;
  color: red;
}
.diff-new a { text-decoration: none; }
.diff-new { background-color: yellow; }
.diff-chg { background-color: lime; }
.diff-chg a { text-decoration: none; }
.diff-new:before,
.diff-new:after
    { content: "\2191" }
.diff-chg:before, .diff-chg:after
    { content: "\2195" }
.diff-old { text-decoration: line-through; background-color: #FBB; }
.diff-old:before,
.diff-old:after
    { content: "\2193" }
.diff-old a { text-decoration: none; }
:focus { border: thin red solid}
</style>
<script src="https://www.w3.org/2016/10/htmldiff-nav.js"></script>);
	if ($opt_t) {
		$styles .= q(
<script type="text/javascript">
<!--
function setOldDisplay() {
	for ( var s = 0; s < document.styleSheets.length; s++ ) {
		var css = document.styleSheets[s];
		var mydata ;
		try { mydata = css.cssRules ;
		if ( ! mydata ) mydata = css.rules;
		for ( var r = 0; r < mydata.length; r++ ) {
			if ( mydata[r].selectorText == '.diff-old' ) {
				mydata[r].style.display = ( mydata[r].style.display == '' ) ? 'none'
: '';
				return;
			}
		}
		} catch(e) {} ;
	}
}
-->
</script>
);

	}

	my $header;		# handle of the header file; undef while none is open
	if ($stripheader) {
		open($header, '>', $headertmp)
			|| die("Header file $headertmp cannot be opened: $!");
	}
	# Write to the header file; does nothing when no header file is open.
	my $toHeader = sub { print {$header} @_ if $header; };

	my $incomment = 0;
	my $inhead = 1;
	open(my $in, '<', $filename) || die("File $filename cannot be opened: $!");
	while (my $line = <$in>) {
		if ($inhead == 1) {
			if ($line =~ m/\<\/head/i) {
				$toHeader->($styles);
			}
			$toHeader->($line);
			if ($line =~ m/\<body/i) {
				# the <body> line ends the header
				$inhead = 0;
				if ($opt_t) {
					$toHeader->(q(
<form action=""><input type="button" onclick="setOldDisplay()" value="Show/Hide Old Content" /></form>
));
				}
				if ($opt_l) {
					$toHeader->(q(
						<p><em>NOTE: Click highlighted diff text to jump to the following difference.</em></p>
					));
				}
				if ($header) {
					close($header) || die("Header file $headertmp cannot be written: $!");
					undef $header;
				}
			}
			next;
		}

		if ($incomment) {
			next unless $line =~ m/-->/;
			$incomment = 0;
			# Non-greedy: stop at the first "-->" so that text between this
			# comment and a later one on the same line is kept.
			$line =~ s/.*?-->//;
		}
		if ($line =~ m/<!--/) {
			while ($line =~ m/<!--.*-->/) {
				$line =~ s/<!--.*?-->//;
			}
			if ($line =~ m/<!--/) {
				# a comment that continues on the following line(s)
				$incomment = 1;
				$line =~ s/<!--.*//;
			}
		}
		if ($line =~ m/\<pre/i) {
			$preformatted = 1;
		}
		if ($line =~ m/\<\/pre\>/i) {
			$preformatted = 0;
		}
		if ($preformatted || ($mhtmlcomments && $line =~ /^;;;/)) {
			$retval .= $line;
			next;
		}

		# Special handling of <H1>-<H6> was already switched off in the
		# original code, so every word is treated alike.
		my @words = split(' ', $line);
		for my $element (@words) {
			# put every tag on a line of its own ...
			$element =~ s/</\n</g;
			$element =~ s/^\n//;
			$element =~ s/>/>\n/g;
			$element =~ s/\n$//;
			# ... but keep punctuation attached to the tag it follows
			$element =~ s/>\n([.,:!]+)/>$1/g;
			$retval .= $element;

			$inelement += ($element =~ tr/<//);
			$inelement -= ($element =~ tr/>//);
			if ($inelement < 0) {
				$inelement = 0;
			}
			# the words of a tag that is still open stay on one line
			$retval .= ($inelement == 0) ? "\n" : " ";
		}
	}
	$retval .= "\n";
	close($in);
	if ($header) {
		# No <body> line was seen: close the header file here so that its
		# contents are flushed before the caller reads it back.
		close($header) || die("Header file $headertmp cannot be written: $!");
	}
	return $retval;
}
445
# MetaHTML comment processing is on by default; cli() turns it off for -c.
our $mhtmlcomments = 1;

# cli - run htmldiff from the command line
#
# Parses the switches (-c -l -t -o), then expects in @ARGV the old
# document, the new document and, optionally, an output file.  Both
# documents are split with splitit(), the pieces are compared with markit(),
# and the result (preceded by the header of the new document when
# $stripheader is set) is written to the output file, or to standard output
# when none is given.
#
# Calls usage() - which does not return - on a bad switch or when fewer
# than two file names are given.  Dies when a file cannot be written.
sub cli {
	our ($opt_c, $stripheader, $tmp1, $tmp2, $headertmp1, $headertmp2);

	getopts("clto") || usage();

	if ($opt_c) {$mhtmlcomments = 0;}

	if (@ARGV < 2) { usage(); }

	my ($file1, $file2, $file3) = @ARGV;

	# write a string to a file, dying on any failure
	my $writeFile = sub {
		my ($path, $text) = @_;
		open(my $out, '>', $path) || die("File $path cannot be opened: $!");
		print {$out} $text;
		close($out) || die("File $path cannot be written: $!");
	};

	$writeFile->($tmp1, splitit($file1, $headertmp1));
	$writeFile->($tmp2, splitit($file2, $headertmp2));

	my $output = "";

	if ($stripheader) {
		# the result starts with the header of the new document
		open(my $header, '<', $headertmp2)
			|| die("File $headertmp2 cannot be opened: $!");
		$output .= join('', <$header>);
		close($header);
	}

	$output .= markit($tmp1, $tmp2);

	# test for a name rather than for truth, so a file called "0" works
	if (defined($file3) && length($file3)) {
		$writeFile->($file3, $output);
	} else {
		print $output;
	}
}
489
# cgi - run htmldiff as a CGI script
#
# Request parameters:
#   oldfile - the URL of the original file
#   newfile - the URL of the new file
#   mhtml   - when supplied, switches MetaHTML comment processing on (true
#             value) or off (false value); when absent the default is kept
#
# Both URLs are downloaded to temporary files, split with splitit() and
# compared with markit().  The result is printed as an HTML response with
# a <base> element added (unless the document already has one), so that
# relative links keep working.  When a download fails, an error page is
# printed instead.
#
# NOTE(review): the URLs come straight from the request and are fetched
# as they are, with whatever schemes LWP supports - consider restricting
# them (for example to http and https) before exposing this script.
sub cgi {
	# Loaded here, at run time, so that command-line use needs neither.
	require CGI;
	require LWP::UserAgent;
	require HTTP::Request;

	our ($mhtmlcomments, $stripheader, $tmp1, $tmp2, $headertmp1, $headertmp2);

	my $query = CGI->new;
	my $url1 = $query->param("oldfile");
	my $url2 = $query->param("newfile");
	my $mhtml = $query->param("mhtml");
	if (defined($mhtml)) {
		$mhtmlcomments = $mhtml ? 1 : 0;
	}

	my $file1 = "/tmp/htdcgi1.$$";
	my $file2 = "/tmp/htdcgi2.$$";

	my $ua = LWP::UserAgent->new;
	$ua->agent("MACS, Inc. HTMLdiff/0.9 " . $ua->agent);

	# Report a failed download: a complete response (header first), with
	# the URL escaped because it is text supplied by the client.
	my $reportFailure = sub {
		my ($response, $url) = @_;
		my $shown = defined($url) ? $query->escapeHTML($url) : "";
		print $query->header(-type=>'text/html',-nph=>1);
		print $response->error_as_HTML();
		print "<p>The URL $shown could not be found.  Please check it and try again.</p>";
		unlink $file1, $file2;	# do not leave downloads behind
	};

	# write a string to a file, dying on any failure
	my $writeFile = sub {
		my ($path, $text) = @_;
		open(my $out, '>', $path) || die("File $path cannot be opened: $!");
		print {$out} $text;
		close($out) || die("File $path cannot be written: $!");
	};

	# Fetch both documents; request() stores the content in the named file

	my $res1 = $ua->request(HTTP::Request->new(GET => $url1), $file1);
	if ($res1->is_error) {
		$reportFailure->($res1, $url1);
		return;
	}

	my $res2 = $ua->request(HTTP::Request->new(GET => $url2), $file2);
	if ($res2->is_error) {
		$reportFailure->($res2, $url2);
		return;
	}

	$writeFile->($tmp1, splitit($file1, $headertmp1));
	$writeFile->($tmp2, splitit($file2, $headertmp2));

	my $output = "";

	if ($stripheader) {
		# the result starts with the header of the new document
		open(my $header, '<', $headertmp2)
			|| die("File $headertmp2 cannot be opened: $!");
		$output .= join('', <$header>);
		close($header);
	}

	$output .= markit($tmp1, $tmp2);

	# The base is the URL of the new document up to its last slash
	my $base = "" . $res2->base;

	if ($base !~ /\/$/) {
		$base =~ s/[^\/]*$//;
	}
	$base = $query->escapeHTML($base);	# it goes into an attribute value

	if ( $output !~ /<base/i ) {
		$output =~ s/<head>/<head>\n<base href="$base">/i ||
		$output =~ s/<html>/<html>\n<base href="$base">/i ;
	}

	print $query->header(-type=>'text/html',-nph=>1);
	print $output;

	unlink $file1, $file2;
}
565
use File::Temp ();

# _scratch_file - create an empty, private temporary file and return its
# name.  The file gets a name that cannot be predicted, is created in the
# system's temporary directory, and is removed when the program ends, even
# if it ends through die().
sub _scratch_file {
	my ($stem) = @_;
	my ($handle, $path) = File::Temp::tempfile("$stem.XXXXXX", TMPDIR => 1, UNLINK => 1);
	close($handle);
	return $path;
}

# Scratch files shared by cli() and cgi(): the split-up old and new
# documents ($tmp1, $tmp2) and their headers ($headertmp1, $headertmp2).
$tmp1 = _scratch_file("htdtmp1");
$headertmp1 = _scratch_file("htdhtmp1");
$tmp2 = _scratch_file("htdtmp2");
$headertmp2 = _scratch_file("htdhtmp2");

# When true, splitit() writes the head of each document to the header file
# and the result starts with the header of the new document.
$stripheader = 1;

if (@ARGV == 0) {
	cgi();		# if no arguments, we must be operating as a cgi script
} else {
	cli();		# if there are arguments, then we are operating as a CLI
}

unlink $tmp1, $headertmp1, $tmp2, $headertmp2;
582