#!/usr/bin/perl -w

use HTML::Parser;
use URI;
use strict;
use warnings;
use Carp;
use Data::Dumper;

my $base=URI->new($ARGV[1]);

my %seen;
my @todo;
my %im_neg;

my $current_array;

# pull title out of an HTML page, by parsing it; end title
# handler used to pull parsing up short
sub file_title {
    my ($filename) = @_;
    if($filename eq "site_map.html") {
	return "(you are here)";
    }
    my $p = HTML::Parser->new(api_version => 3,
	     start_h => [\&title_start_handler, "self,tagname,attr"],
	     report_tags => [qw(title)],
	    );
    $p->parse_file($filename) || die "$filename $!";
    my $a =  $p->handler("text");
    my $title = join("", @$a);
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;
    $title =~ s/\s+/ /g;
    return $title;
}

sub title_start_handler {
    my($self, $tag, $attr) = @_;

    $self->handler(text  => [], '@{dtext}' ); # accumulate text params into anon array
    $self->handler(end   => \&title_end_handler, "self"); # install the killer!
}

sub title_end_handler {
    my($self) = @_;
    $self->eof();
}

sub resolve {
    my ($path) = @_;
    return URI->new($path)->rel($base)->as_string();
}

sub path_is_abs {
    ($_) = @_;
    return m@/@ || m/:/;
}

sub local_not_html {
    my($path) = @_;
    my $thing = {
	title => $path,
	path => $path,
	array => [],
    };
    push @$current_array , $thing;
}

sub a_start_handler {
    my($self, $tag, $attr) = @_;
    my($path);

    if($tag eq "base") {
	    print "BASE $attr->{href}\n";
	    $base = URI->new($attr->{href});
    }
    if($tag eq "a" && defined($attr->{href})) {
	$path = resolve($attr->{href});
	if(!path_is_abs($path)) {
	    #local page, on this site
	    if($path !~ s/#.*$//g)	{ # suppress intra page anchors
		if($path =~ /\.html$/) {
		    if($path ne "index.html") {
			my $thing = {
			    title => file_title($path),
			    path => $path,
			    array => [],
			};
			push @$current_array , $thing;
			if(!exists($seen{$path})) {
			    push @todo, $thing;
			}
		    }
		} else {
		    # local file, probably JPG or PDF
		    local_not_html($path);
		}
	    }
	}
    }
    if($tag eq "img") {
	$path = resolve($attr->{src});
	# only link local images
	if(!path_is_abs($path) && $path !~ "thumb\." && !exists($im_neg{$path})) {
	    local_not_html($path);
	}
    }

}

sub do_file {
    my ($filename) = @_;
    if(!exists($seen{$filename})) {
	$seen{$filename} = 1;

	my $p = HTML::Parser->new(api_version => 3,
	    report_tags => [qw(base a img)],
	);

	$p->handler(start => \&a_start_handler, "self,tagname,attr");
	$p->parse_file($filename) || die $!;
    }
}

sub root {
    my ($start) = @_;
    my $start_thing = {
	path => $start,
	array => [],
    };
    push @todo, $start_thing;
    # classic breadth first algorithm, builds up
    # data against each page
    while(scalar(@todo) != 0) {
	my $thing = shift @todo;
	$current_array = $thing->{array};
	do_file($thing->{path});
    }
    # print Dumper($start_thing);
    return $start_thing;
}

# walks data built up
sub gen_html {
    my ($root) = @_;
    my $a = $root->{array};
    if(scalar(@$a) != 0) {
	printf("<ul>\n");
	foreach my $t (@$a) {
	    printf("<li><a href=\"%s\">%s</a></li>\n", 
		$t->{path}, $t->{title});
	    gen_html($t);
	}
	printf("</ul>\n");
    }
}

$seen{"site_map.html"} = 1;
$im_neg{"vi.made.with.vi.png"} = 1;
$im_neg{"getacro.png"} = 1;
$im_neg{"uk_wh_logo2.gif"} = 1;

my $root = root($ARGV[0]);

print <<END;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<title>Site Map</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> 
<meta name="Author" content="BugBear(Paul Womack)">
<meta name="description" content="automatically generated site map">
<meta name="keyword" content="site map sitemap">
</head>
<body bgcolor="#ffffff">
<h3>Site Map</h3>
END

gen_html($root);

print <<END;
<table>
<tr>
<td align=left width="50%">
<a href=index.html>back to home page</a><br>
</td>
<td align=left width="50%">
<img src="vi.made.with.vi.png" align=middle width=88 height=31 alt="made with VI"><br>
</td>
</tr>
</table>                            
</body>
</html>
END
