#!/usr/bin/perl -w

# Link maintenance toy for a website
# Waider 2000
use strict;
use HTML::Parser;
use GDBM_File;
use File::Find;
use File::Spec;
use File::Basename;
use Storable qw( freeze thaw );
use Getopt::Long;

# Parser state shared between the HTML::Parser callbacks defined below.
my $p;          # the HTML::Parser object
my $newdoc;     # reconstructed document text, accumulated by the handlers
my $linktext;   # text collected between the current <a> and </a>
# Open-anchor counter; bumped by start(), reset by end().  Initialised to
# 0 (it used to start life undef) so the "$href > 0" / "$href < 0" checks
# after parsing a link-free file don't warn under -w.
my $href = 0;
my $url;        # normalised target of the current <a href=...>

my $script = 0; # NOTE(review): appears unused in this file; kept as-is.

# Parse document text chunk by chunk
# %linkfarm maps a lowercased link phrase to a Storable-frozen array-ref
# of URLs, persisted in a GDBM database under the web tree.
my %linkfarm;

chdir "$ENV{'HOME'}/public_html" or die "You don't have a public_html directory\n";

my $db = tie %linkfarm, 'GDBM_File', "$ENV{'HOME'}/public_html/.linkfarm",
  &GDBM_WRCREAT, 0644 or die $!;

# Print a usage summary on STDERR and abort with a non-zero exit.
# Now lists every switch GetOptions actually accepts -- the -lookup
# option was missing from the original message.
sub usage {
  print STDERR
	"usage:\n\t$0 [-dump] [-elisp] [-quiet] [-update] [-lookup <phrase>] [<htmlfile>]\n";
  exit 1;
}

# Command-line switches; each defaults to off (or empty for -lookup,
# which takes an optional string argument).
my $dump   = 0;
my $elisp  = 0;
my $silent = 0;
my $update = 0;
my $lookup = "";
GetOptions(
  "dump"     => \$dump,
  "elisp"    => \$elisp,
  "quiet"    => \$silent,
  "update"   => \$update,
  "lookup:s" => \$lookup,
) or usage();

# Dump the link farm in the form "phrase\0link1\0link2..."
# Values in %linkfarm are Storable-frozen array-refs, so each one is
# thawed before its URLs are NUL-joined.  One phrase per output line.
if ( $dump ) {
  for ( sort keys %linkfarm ) {
	print $_, "\0", join( "\0", @{thaw $linkfarm{$_}}), "\n";
  }
  exit;
}

# new hack: output an alist directly.
# Emit the farm as an emacs-lisp alist: (("phrase" "url") ...).
# NOTE(review): lisp_clean() only uses its first argument, so a phrase
# with several recorded URLs gets only the first one emitted here --
# presumably deliberate for a one-to-one alist, but worth confirming.
if ( $elisp ) {
  print "(\n";
  for ( sort keys %linkfarm ) {
	print "(\"";
	print lisp_clean( $_ );
	print "\" \"";
	print lisp_clean( @{thaw $linkfarm{$_}});
	print "\")\n";
  }
  print ")\n";
  exit;
}

# lookup
# Print every URL recorded for the given phrase.  Phrases are stored
# lowercased, so the lookup key is folded too.
if ( $lookup ) {
  shift; # NOTE(review): discards $ARGV[0]; looks like a leftover from a
         # positional-argument version -- harmless, since we exit below.
  my $phrase = lc( $lookup );
  if ( defined( $linkfarm{$phrase})) {
	print "$lookup:\n  ";
	print join( "\n  ", @{thaw $linkfarm{$phrase}}), "\n"
  }
  exit;
}

# update
# Full rescan: walk the web tree collecting every .html file onto @ARGV
# so the main loop below processes them all, then flush the old farm.
if ( $update ) {
  print "Scanning for files..." unless $silent;
  find( { wanted =>
		  sub { push @ARGV, $File::Find::name
				  if !-d $File::Find::name &&
					$File::Find::name =~ /\.html$/; },
		  no_chdir => 1 }, '.');
  print "done.\n" unless $silent;

  # Also, this is a full update, so flush what we have.  keys() snapshots
  # the key list before we start deleting, so this is safe on the tied
  # hash; the old version also sort()ed the keys first, which bought
  # nothing for a wholesale delete.
  for ( keys %linkfarm ) {
	delete $linkfarm{$_};
  }
}

# Nothing named on the command line and nothing found by -update:
# there is nothing to scan, so explain ourselves and quit.
if ( $#ARGV == -1 ) {
  usage();
}

# Create parser object
# start/end/text handlers harvest link phrases into %linkfarm; anything
# else (comments, declarations, processing instructions) is appended
# verbatim to $newdoc by the default handler so the document text is
# reconstructed as a side effect of parsing.
$p = HTML::Parser->new(api_version => 3,
					   start_h =>
					   [\&start, "tagname, attr, text" ],
					   end_h   => [\&end,   "tagname, text" ],
					   text_h => [\&default, 'text' ],
					   default_h => [sub { $newdoc .= shift }, 'text' ],
					  );
# Deliver each run of text as a single chunk so a link phrase is never
# split across multiple text_h callbacks.
$p->unbroken_text( 1 );

# Process each remaining command-line argument as an HTML file.  Parsing
# populates %linkfarm via the handlers above; $href doubles as a sanity
# counter for balanced <a>...</a> pairs within each file.
while ($#ARGV > -1 ) {
  print STDERR "Scanning file $ARGV[0] for links..." unless $silent;
  $newdoc = "";
  $linktext = "";
  $p->parse_file($ARGV[0]);
  print STDERR "done.\n" unless $silent;
  if ( $href > 0 ) {
	print STDERR "WARNING: " . $ARGV[0] . " has an unclosed A tag.\n";
	$href = 0;
  } elsif ( $href < 0 ) {
	# NOTE(review): end() resets $href to 0 rather than decrementing, so
	# this branch appears unreachable as written -- confirm intent.
	print STDERR "WARNING: " . $ARGV[0] . " has an extra A close tag.\n";
	$href = 0;
  }
  shift;
}

# Release the database handle before untying, then untie to flush.
undef $db;
untie %linkfarm;

# Subs.
# Turn an href found in the current file ($ARGV[0]) into a path relative
# to ~/public_html.  Returns undef for undefined input, in-page anchors,
# and links whose target does not exist on disk; scheme-qualified URLs
# (http:, mailto:, ...) are returned untouched.
sub normalise {
  my $url = shift;

  return if !defined( $url );
  return $url if $url =~ /^[a-z]+:/; # go back already
  return undef if $url =~ /^#/; # don't care about in-page bookmarks

  # Get absolute path to link
  my $cwd = dirname( File::Spec->rel2abs( $ARGV[ 0 ],
										  "$ENV{'HOME'}/public_html" ));

  # NOTE(review): hard-coded to this site's own user path; anyone reusing
  # this script needs to change it.
  $url =~ s|/~waider/||;

  # Relative links are resolved against the directory of the current file.
  if ( $url !~ /^\// ) {
	$url = File::Spec->rel2abs( $url, $cwd );
  }

  # A link to a directory really points at its index page.
  if ( -d $url ) {
	$url .= "/index.html";
  }

  # Verify the target exists on disk, ignoring any #bookmark suffix.
  my $tmpurl = $url;
  $tmpurl =~ s/#.*$//; # remove bookmarks
  if ( !-f $tmpurl ) {
	print STDERR "$url in $ARGV[0] is a dead link.\n" unless $silent;
	return undef; # ignore broken links!
  }

  # HAHAHAH. canonpath isn't REALLY canonical. Thanks for these, troc.
  # Collapse "/./" and "dir/../" segments repeatedly, then squeeze any
  # run of slashes down to one.
  1 while ($url =~ s{(^|/)\.(/|$)}{$1&&$2&&'/'}gex);        # cperl '});
  1 while ($url =~ s{(^|/)[^/]+?/\.\.(/|$)}{$1&&$2&&'/'}gex); # fix '});
  $url =~ s{([^:/])/+}{$1/}g;

  # Finally, make it relative to my web tree
  $url = File::Spec->abs2rel( $url, "$ENV{'HOME'}/public_html" );

  $url;
}

# Start-tag handler.  On <a ...> remember the (normalised) target and
# note that we are now inside a link; every tag's original text is
# passed straight through to the rebuilt document.
sub start {
  my ( $tag, $attr, $text ) = @_;
  if ( lc( $tag ) eq 'a' ) {
	$url = normalise( $attr->{'href'} );
	$href++;
  }
  $newdoc .= $text;
}

# End-tag handler.  On </a>, canonicalise the phrase collected since the
# matching <a> and record phrase => [urls] in the (tied) link farm;
# values in %linkfarm are Storable-frozen array-refs.  The tag's text is
# always passed through to the rebuilt document.
sub end {
  if ( $_[0] =~ /^a$/i ) {
	# Canonicalise the phrase: collapse whitespace, trim, escape ALL
	# double quotes (the original s/"/&quot;/ lacked /g and only caught
	# the first one), and fold case so lookups are case-insensitive.
	$linktext =~ s/[\n\r \t]+/ /gs;
	$linktext =~ s/^\s+//;
	$linktext =~ s/\s+$//;
	$linktext =~ s/"/&quot;/g;
	$linktext = lc( $linktext );

	# Skip one-character phrases and links normalise() rejected (undef).
	if ( length( $linktext ) > 1 && defined( $url )) {
	  my $cur = defined( $linkfarm{$linktext} )
		? thaw $linkfarm{$linktext}
		: [];
	  # Exact string comparison: the old "grep /$url/" interpolated the
	  # URL as an unescaped regex, so metacharacters could misfire and
	  # substring matches (e.g. "index.html" vs "foo/index.html") could
	  # wrongly suppress a genuinely new entry.
	  if ( !grep { $_ eq $url } @$cur ) {
		push @$cur, $url;
		$linkfarm{$linktext} = freeze $cur;
	  }
	}

	$linktext = "";
	# NOTE(review): resetting to 0 (rather than $href--) means the main
	# loop's "extra A close tag" check can never fire; left unchanged to
	# preserve existing behaviour.
	$href = 0;
  }
  $newdoc .= $_[1];
}

# check if we should be saving text here.
sub default {
  if ( $href ) {
	$linktext .= $_[0];
  }
  $newdoc .= $_[0];
}

# Backslash-escape double quotes so a phrase or URL can be embedded
# inside an elisp string literal.  Operates on a copy; the caller's
# value is untouched.
sub lisp_clean {
  my ( $text ) = @_;
  $text =~ s/"/\\"/g;
  return $text;
}
