cognition/0.1-alpha2

#!/usr/bin/perl


######################################################################
#  Cognition/0.1-alpha2 - A metadata browser
#  Copyright (c) 2008 Toby Inkster.
######################################################################
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
######################################################################


######################################################################
#  Version History
######################################################################
#
#  cognition/0.1-alpha1 :-
#
#  - initial release
#  - metadata: <meta>, <link>, <title>, @role, eRDF
#      - eRDF does not support rdf:type syntax
#      - RFC 2731 is supported for namespaces
#  - microformats: hcard, hcalendar, adr, geo
#      - hcalendar support assumes page is one giant calendar
#      - no support for rel-tag, so no support for categories in hcard or
#        hcalendar
#      - geo support includes body, altitiude and reference-frame extensions           
#  - microformats patterns: include-pattern, abbr-pattern, extensions
#      - include-pattern supports my alternative syntax
#      - abbr-pattern supports Andy Mabbett's alternative
#  - RDF output of namespaced metadata
#    
#  cognition/0.1-alpha2 :-
#
#  - drop usage of XML::XPath module, using XML::DOM instead
#      - might use XML::DOM::XPath in future if XPath support is needed
#  - support XML namespaces used as metadata namespaces.
#  - microformats: hcalendar (complete), rel-tag, rel-license, figure, xoxo
#      - rel-licence extended to support searches for 'license' in CC or
#        DCTERMS namespaces; or 'rights.license' in DC or DCTERMS namespaces
#      - experimental figure microformat based on current brainstorming
#  - parse document structure (headings + semantic tables + semantic
#    images/figures microformat? + xoxo lists)
#
#  future work???
#
#  - add support for Person objects (hcard) to RDF output
#  - add support for Event objects to RDF output
#  - use <rdf:Bag> to wrap multiple tuples with the same subject and property
#  - microformats: hatom, hreview?
#  - support rel=meta to retrieve attitional document metadata, and parse RDF!
#  - RDFa?
#
######################################################################


######################################################################
package Cognition::HTMLParser;
######################################################################

use Data::Dumper;
use HTML::Entities;
use HTML::TreeBuilder;
use HTTP::Request;
use Sort::ArrayOfArrays;
use URI::URL;
use XML::DOM;
use XML::Parser;
#use XML::XPath;
use strict;


BEGIN
{
	our $VERSION = 0.1;
	our $XDP = new XML::DOM::Parser('NoLWP' => 1);
	$Data::Dumper::Indent = 1;
}


sub new
{
	my $self     = shift;
	my $source   = shift;
	my $opts     = shift;
	
	my $this   = {
		'UA'       => $opts->{ua},
		'REQUEST'  => $opts->{request},
		'RESPONSE' => $opts->{response}
	};
	
	$this->{RESPONSE}->header('content-type') =~ /(xml)/i;
	my $xml = (length $1) ? 1 : 0;
	
	$this->{BASE} = $this->{RESPONSE}->base();

	if (0 && $xml)
	{
		$this->{_xml} = 1;
		$this->{_source} = $source;
	}
	else
	{
		$this->{_xml} = 0;
		$this->{_source} = $self->html2xhtml($source);
		$this->{_raw_source} = $source;
	}
	
	#my $xpp = XML::Parser->new(
	#	ErrorContext => 2,
	#	# ParseParamEnt => 1,
	#	NoLWP => 1
	#);
	#$xpp->setHandlers(
	#	ExternEnt => 0,
	#	ExternEntFin => 0		
	#);
	#$this->{XP}  = XML::XPath->new(xml => $this->{_source}, parser=>$xpp);

	$this->{DOM} = $Cognition::HTMLParser::XDP->parse($this->{_source});

	bless($this);

	$this->parse_metadata();
	$this->parse_uformats();
	$this->parse_doc_structure();

	return $this;
	
} #/sub new


sub dumpText
{
	my $this = shift;
	return Dumper($this);
} #/sub dumpText


sub dumpTextClean
{
	my $this = shift;
	my $tmp  = {};
	$tmp->{DOM} = $this->{DOM};  $this->{DOM} = '...hidden...';
	# $tmp->{XP} = $this->{XP};  $this->{XP} = '...hidden...';
	
	$Data::Dumper::Sortkeys = \&_cleanKeys;
	my $txt = $this->dumpText;
	
	$this->{DOM} = $tmp->{DOM};
	# $this->{XP} = $tmp->{XP};

	return $txt;
} #/sub dumpTextClean


sub _cleanKeys
{
	my $hash = shift;
	my @keys = sort grep {/^(_subject|_id|[^_])/} keys %{ $hash };
	return \@keys;
}


sub quoted
{
	return '"'.(shift).'"';
} #/sub quoted


sub dumpRDF
{
	my $this    = shift;
	my $triples = $this->triples;
	
	my %lookup;
	
	my $rv = "<?xml version=\"1.0\"?>\n"
		. "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"";
		
	foreach my $ns (keys %{ $this->{metadata_ns} })
	{
		my $nsurl = $this->{metadata_ns}->{$ns}->{nsurl};
		next unless (length $nsurl);
		$lookup{$nsurl} = lc($ns);
		
		$rv .= "\n\txmlns:".lc($ns)."=\"".encode_entities($nsurl)."\"";
	}
	$rv .= ">\n\n";
	
	my $current_subject = undef;
	for (my $i=0; defined $triples->[$i]; $i++)
	{
		my ($subject, $key, $value) = @{ $triples->[$i] };
		my $stripped_subject;
		
		if ($subject ne $current_subject)
		{
			if (defined $current_subject)
			{
				$rv .= "\t</rdf:Description>\n\n";
			}
		
			$current_subject = $subject;
			$stripped_subject = $subject;
			$stripped_subject =~ s/(^\<|\>$)//g;
			
			$rv .= "\t<rdf:Description rdf:about=\"".encode_entities($stripped_subject)."\">\n"
		}
		
		my $tag = $this->_fq2pfx($key);
		
		if ($value =~ /^\</)
		{
			$value =~ s/(^\<|\>$)//g;
			$rv .= "\t\t<$tag rdf:resource=\"".encode_entities($value)."\" />\n";
		}
		else
		{
			$value =~ s/(^\"|\"$)//g;
			$rv .= "\t\t<$tag>".encode_entities($value)."</$tag>\n";
		}
	}

	if (defined $current_subject)
	{
		$rv .= "\t</rdf:Description>\n\n";
	}
	
	$rv .= "</rdf:RDF>\n";
	
	return $rv;

} #/sub dumpRDF


sub dumpTriples
{
	my @triples = shift->triples;
	return Dumper(\@triples);
} #/sub dumpTriples


sub triples
{
	my $this  = shift;
	my @rv;
	my %RV;
	my @metas = keys %{ $this->{Meta} };
	foreach my $m (@metas)
	{
		for (my $i=0; defined $this->{Meta}->{$m}->[$i]; $i++)
		{
			my $M = $this->{Meta}->{$m}->[$i];
			my ($subject, $value);
			
			$subject = $M->{_subject} || '';
			$subject = "<$subject>";
			
			if ($M->{_attr} eq 'rel')
			{
				$value = '<'.$M->{href}.'>';
				
				if ($M->{title})
				{
					$RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1;
				}
			}
			elsif ($M->{_attr} eq 'rev')
			{
				$value = '<>';
				
				if ($M->{title})
				{
					$RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1;
				}
			}
			else
			{
				$value = quoted($M->{data});
			}
			
			$RV{$subject}{$m}{$value} = 1 if ($m =~ /^\</);
		}
	}
	
	# Using a hash to begin with eliminates duplicates.
	foreach my $i (keys %RV)
	{
		foreach my $j (keys %{ $RV{$i} })
		{
			foreach my $k (keys %{ %{$RV{$i}}->{$j} })
			{
				push @rv, [($i, $j, $k)];
			}
		}
	}
	
	my $sort = Sort::ArrayOfArrays->new(
		sort_column => '0,1,2',
		results     => \@rv
	);
	
	return $sort->sort_it;
} #/sub triples


sub parse_rfc2731_namespaces
# Discover namespaces for metadata conforming to RFC 2731. Note
# that eRDF uses the same format for metadata schemas, so we
# can re-use these for eRDF too.
{
	my $this = shift;
	
	# We'll start by defining some namespaces for metadata.
	# The document is free to redefine them, but it seems to
	# work better if we define these by default.
	$this->{metadata_ns} = {
		# Some people forget to manually specify these
		'DC' => {
			nsurl => 'http://purl.org/dc/elements/1.1/',
			title => 'Dublin Core 1.1'
		},
		'DCTERMS' => {
			nsurl => 'http://purl.org/dc/terms/',
			title => 'Dublin Core Terms'
		},
		# Pseudo-schema for <meta http-equiv>.
		'HTTP11' => {
			nsurl => 'urn:ietf:rfc:2616#',
			title => 'Hypertext Transfer Protocol 1.1'
		},
		# Metaschema for RFC2731 itself
		'SCHEMA' => {
			nsurl => 'urn:ietf:rfc:2731#',
			title => "The 'Schema' Metaschema"
		},
		# Defined by Atom, not strictly RFC2731-compliant, but can be
		# treated as such. (service.feed, service.edit, service.post)
		'SERVICE' => {
			nsurl => 'http://www.w3.org/2007/app#',
			title => 'Atom Publishing Protocol'
		},
		# Defined by OpenID, not strictly RFC2731-compliant, but can be
		# treated as such. (openid.server, openid.delegate)
		'OPENID' => {
			nsurl => 'http://openid.net/specs/openid-authentication-1_1.html#',
			title => 'OpenID 1.1'
		}
	};
	
	foreach my $a ($this->{DOM}->getDocumentElement->getAttributes->getValues)
	{
		my $k = $a->getName;
		my $v = $a->getValue;
		next unless $k =~ /^xmlns\:/i;
		$k =~ s/^xmlns\://i;
	
		$this->{metadata_ns}->{$k} = {
			nsurl => $v,
			title => "The '$k' Namespace",
			xmlns => 1
		};

		$this->{xmlns}->{$k} = {
			nsurl => $v,
			title => "The '$k' Namespace"
		};
	}
	
	# Find custom RFC 2731 namespaces. FOAF and other stuff goes here.
	foreach my $l ($this->{DOM}->getElementsByTagName('link'))
	{
		my $rel = $l->getAttribute('rel');
		
		if ($rel =~ /^schema[\.\:]([A-Z_][A-Z0-9_-]*)$/i)
		{
			my $prefix = uc($1);
			my $url    = $this->abs_url($l->getAttribute('href'));
			my $title  = $l->getAttribute('title');
			
			$this->{metadata_ns}->{$prefix}->{nsurl} = $url;
			$this->{metadata_ns}->{$prefix}->{title} = $title || "The '$prefix' Schema";
		}
	}
	
} #/sub parse_rfc2731_namespaces


sub parse_metadata
{
	my $this = shift;
	$this->parse_rfc2731_namespaces;
	
	# Parse links 
	my @links   = $this->{DOM}->getElementsByTagName('link');
	my @anchors = $this->{DOM}->getElementsByTagName('a');
	push @links, @anchors;
	foreach my $l (@links)
	{
		my $rel = $l->getAttribute('rel');
		if (length $rel)
		{
			$rel =~ s/(^\s+|\s+$)//g;
			$rel =~ s/\s+/ /g;
			my @rel = split / /, $rel;
			
			foreach my $r (@rel)
			{
				my $nsrel = $this->_pfx2fq($r, undef, $l);
				
				push @{ $this->{Meta}->{$nsrel} },
					{
						title  => $l->getAttribute('title'),
						media  => $l->getAttribute('media'),
						data   => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'),
						href   => $this->abs_url($l->getAttribute('href')),
						type   => $l->getAttribute('type'),
						lang   => $l->getAttribute('lang'),
						_meta  => $r,
						_xmeta => $nsrel,
						_node  => $l,
						_tag   => $l->getTagName,
						_attr  => 'rel'
					};
			}
		}

		my $rev = $l->getAttribute('rev');
		if (length $rev)
		{
			$rev =~ s/(^\s+|\s+$)//g;
			$rev =~ s/\s+/ /g;
			my @rev = split / /, $rev;
			
			foreach my $r (@rev)
			{
				my $nsrev = $this->_pfx2fq($r, undef, $l);
				
				push @{ $this->{Meta}->{$nsrev} },
					{
						title  => $l->getAttribute('title'),
						media  => $l->getAttribute('media'),
						data   => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'),
						href   => $this->abs_url($l->getAttribute('href')),
						type   => $l->getAttribute('type'),
						lang   => $l->getAttribute('lang'),
						rev    => $rev,
						_xmeta => $nsrev,
						_meta  => $r,
						_node  => $l,
						_tag   => $l->getTagName,
						_attr  => 'rev',
						_subject=> $this->abs_url($l->getAttribute('href'))
					};
			}
		}
	}
	
	# Parse <meta>
	foreach my $m ($this->{DOM}->getElementsByTagName('meta'))
	{
		my $name = $m->getAttribute('http-equiv');
		if (length $name)
		{
			my $nsname = $this->_pfx2fq('HTTP11.'.$name);
		
			push @{ $this->{Meta}->{$nsname} },
				{
					data   => $m->getAttribute('content'),
					scheme => $this->_pfx2fq($m->getAttribute('scheme')),
					'http-equiv' => $name,
					_meta  => $name,
					_xmeta => $nsname,
					_node  => $m,
					_tag   => $m->getTagName,
					_attr  => 'http-equiv'
				};
		}
		else
		{
			$name = $m->getAttribute('name');
			next unless (length $name);
			$name =~ s/(^\s+|\s+$)//g;
			$name =~ s/\s+/ /g;
			my @name = split / /, $name;
			
			foreach my $n (@name)
			{
				my $nsname = $this->_pfx2fq($n, undef, $m);
				
				push @{ $this->{Meta}->{$nsname} },
					{
						data   => $m->getAttribute('content'),
						lang   => $m->getAttribute('lang'),
						scheme => $this->_pfx2fq($m->getAttribute('scheme')),
						name   => $name,
						_meta  => $n,
						_xmeta => $nsname,
						_node  => $m,
						_tag   => $m->getTagName,
						_attr  => 'name'
					};
			}
		}
	}	
	
	# Parse HTTP headers.
	my @headers = $this->{RESPONSE}->header_field_names;
	foreach my $h (@headers)
	{
		my $nsname = $this->_pfx2fq('HTTP11.'.$h);
		my @vals = $this->{RESPONSE}->header($h);
		
		foreach my $v (@vals)
		{
			push @{ $this->{Meta}->{$nsname} },
				{
					data   => $v,
					'http-equiv' => $h,
					_meta  => $h,
					_xmeta => $nsname,
					_tag   => '_http_header'
				};	
		}
	}
	
	# Parse <title>.
	foreach my $t ($this->{DOM}->getElementsByTagName('title'))
	{
		push @{ $this->{Meta}->{'title'} },
			{
				data   => Cognition::HTMLParser::Utils::STRINGIFY($t),
				lang   => $t->getAttribute('lang'),
				_meta  => 'title',
				_xmeta => 'title',
				_node  => $t,
				_tag   => $t->getTagName
			};
	}
	
	# Parse HTML role="" attribute, filling _element, data, url (if _element=='a'), lang	
	# Parse eRDF <URL:http://research.talis.com/2005/erdf/wiki/Main/RdfInHtml>.
	# We do not support the specialised "rdf:type" notation though.
	foreach my $e ($this->{DOM}->getElementsByTagName('*'))
	{
		my $role = $e->getAttribute('role');
		
		if (length $role)
		{
			$role =~ s/(^\s+|\s+$)//g;
			$role =~ s/\s+/ /g;
			my @role = split / /, $role;
			
			foreach my $r (@role)
			{
				my $nsrole = $this->_pfx2fq($r, undef, $e);
				
				push @{ $this->{Meta}->{$nsrole} },
					{
						title  => $e->getAttribute('title'),
						lang   => $e->getAttribute('lang'),
						href   => $this->abs_url($e->getAttribute('href')),
						data   => Cognition::HTMLParser::Utils::STRINGIFY($e),
						role   => $role,
						_meta  => $r,
						_xmeta => $nsrole,
						_node  => $e,
						_tag   => $e->getTagName,
						_attr  => 'role'
					};
			}
		}
		
		$role = $e->getAttribute('class');
		if (length($role))
		{
			$role =~ s/(^\s+|\s+$)//g;
			$role =~ s/\s+/ /g;
			my @role = split / /, $role;
			
			foreach my $r (@role)
			{
				my $nsrole = $this->_pfx2fq($r, '-', $e);
				next if ($nsrole eq lc($r)); # Only allow namespaced classes.
				
				my $parent = $e;
				my $subject = undef;
				while (!defined $subject && $parent->getNodeType==ELEMENT_NODE)
				{
					if ($parent->getAttribute('id'))
					{
						$subject = $parent;
					}
					else
					{
						$parent = $parent->getParentNode();
					}
				}
				
				push @{ $this->{Meta}->{$nsrole} },
					{
						title  => $e->getAttribute('title'),
						lang   => $e->getAttribute('lang'),
						href   => $this->abs_url($e->getAttribute('href')),
						data   => $e->getAttribute('title') || Cognition::HTMLParser::Utils::STRINGIFY($e),
						class  => $role,
						_meta  => $r,
						_xmeta => $nsrole,
						_node  => $e,
						_tag   => $e->getTagName,
						_attr  => 'class',
						_subject=> $parent->getNodeType==ELEMENT_NODE ? ('#' . $parent->getAttribute('id')) : ''
					};
			}
		}
	}
	
} #/sub parse_metadata


sub _pfx2fq
{
	my $this = shift;
	my $key  = shift;
	my $mode = shift;
	my $dom  = shift;
	my ($ns, $term, $nsurl);
	
	$mode = '.:' unless length $mode;
	
	# In "dash mode" allows the hyphen to be used as a namespace
	# prefix separator. This is for eRDF.
	if (($mode eq '-' && $key =~ /^([A-Z_][A-Z0-9_]*)[\.\:-](.+)$/i)
	# Normally, dash is allowed as part of the namespace prefix.    
	|| ($mode eq '.:' && $key =~ /^([A-Z_][A-Z0-9_-]*)[\.\:](.+)$/i))
	{
		$ns = $1;
		$term = $2;
		$nsurl = $this->{metadata_ns}->{uc($ns)}->{nsurl};
		
		# If we haven't found the namespace yet, then check xmlns:* attributes.
		if ($dom && !length $nsurl)
		{
			$nsurl = Cognition::HTMLParser::Utils::xmlns($ns, $dom);
			
			$this->{xmlns}->{$ns} = {
				nsurl   => $nsurl,
				xmlns   => 1
			};
		}
		
		# In eRDF, cope with undefined prefixes by assuming that
		# the class name was never intended as eRDF.
		if ($mode eq '-' && !$nsurl)
		{
			return $key;
		}
		
		# Otherwise, if an undefined prefix is used, complain quietly.
		$nsurl = 'http://undefined-namespace-prefix.invalid/' if (!$nsurl);
		
		return '<'.$nsurl.$term.'>';
	}

	# If no valid prefix is found, treat as case-insensitive default namespace
	return lc($key);

} #/sub _pfx2fq


sub _fq2pfx
{
	my $this = shift;
	my $key  = shift;
	my $mode = shift || ':';

	$key =~ s/(^\<|\>$)//g;
	$key =~ /^(.*[\#\/])([^\#\/]+)$/;

	my $nsurl = $1;
	$key      = $2;
	my $pfx;

	foreach my $k (keys %{ $this->{metadata_ns} })
	{
		if ($this->{metadata_ns}->{$k}->{nsurl} eq $nsurl)
		{
			$pfx = lc($k);
			return $pfx.$mode.$key;
		}
	}

	foreach my $k (keys %{ $this->{xmlns} })
	{
		if ($this->{xmlns}->{$k}->{nsurl} eq $nsurl)
		{
			$pfx = $k;
			return $pfx.$mode.$key;
		}
	}
	
	return $key;

} #/sub _fq2pfx


sub abs_url
{
	my $this = shift;
	my $url = url shift, $this->{BASE};
	return $url->abs->as_string;
} #/sub abs_url


sub parse_uformats
{
	my $this = shift;
	my (@tmp1, @tmp2, @tmp3, @tmp4, @tmp5, @tmp6);
	
	(@tmp1 = Cognition::HTMLParser::hCard::parse_all($this)) && ($this->{uF}->{hCard} = \@tmp1);
	(@tmp2 = Cognition::HTMLParser::adr::parse_all($this))   && ($this->{uF}->{adr} = \@tmp2);
	(@tmp3 = Cognition::HTMLParser::geo::parse_all($this))   && ($this->{uF}->{geo} = \@tmp3);
	(@tmp4 = Cognition::HTMLParser::Rel::Tag::parse_all($this)) && ($this->{uF}->{Tag} = \@tmp4);
	(@tmp5 = Cognition::HTMLParser::hCalendar::parse_all($this)) && ($this->{uF}->{hCalendar} = \@tmp5);
	(@tmp6 = Cognition::HTMLParser::Rel::License::parse_all($this)) && ($this->{uF}->{License} = \@tmp6);
	
	# hAtom
	# rel-license + CC license
	# rel-enclosure

} #/sub parse_uformats


sub parse_doc_structure
{

	my $this = shift;
	my $rv   = { kids=>() };
	my $lvl  = 1;
	my $ptr  = $rv;
	
	my $count_lists;
	my $count_tables;
	my $count_figures;

	# Create my own pseudo-uF: list of semantic tables (i.e. having <caption> or summary)
	# Create my own pseudo-uF: list of figures (i.e. <img class="figure" title="blah">)
	# Create my own pseudo-uF: list of semantic lists (i.e. <ul class="xoxo">)
	# Parse heading structure, including figures, tables and lists}

	foreach my $e ($this->{DOM}->getElementsByTagName('*'))
	{
		my $tag = $e->getTagName;
		
		if ($tag eq 'figure' || $e->getAttribute('class') =~ / (^|\s) figure (\s|$) /x)
		{
			my $node = {};
			push @{ $ptr->{kids}->[-1]->{kids} }, $node;
			$node->{tag} = 'figure';
			$node->{count} = ++$count_figures;
			$node->{figure} = Cognition::HTMLParser::figure::parse($this, $e);
			$node->{_dom} = $e;
		}
		
		elsif ($e->getAttribute('class') =~ / (^|\s) xoxo (\s|$) /x)
		{
			my $node = {};
			push @{ $ptr->{kids}->[-1]->{kids} }, $node;
			$node->{tag} = 'xoxo';
			$node->{count} = ++$count_lists;
			$node->{figure} = Cognition::HTMLParser::xoxo::parse($this, $e);
			$node->{_dom} = $e;
		}

		elsif ($tag =~ /^h([1-6])$/i)
		{
			my $thisLevel = $1;
			my $node = {};
			
			while ($thisLevel > $lvl)
			{
				my $parent = $ptr->{kids}->[-1];
				$parent->{up} = $ptr;
				$ptr = $parent;
				$lvl++;
			}
			while ($thisLevel < $lvl)
			{
				$lvl--;
				$ptr = $ptr->{up};
			}
			if ($thisLevel == $lvl)
			{
				push @{ $ptr->{kids} }, $node;
			}
			
			$node->{tag}     = $tag;
			$node->{_dom}    = $e;
			$node->{heading} = Cognition::HTMLParser::Utils::STRINGIFY($e);
		}
		
		elsif ( $tag eq 'table' && (length $e->getAttribute('summary') || defined $e->getElementsByTagName('caption')->item(0)) )
		{
			my $node = {};
			push @{ $ptr->{kids}->[-1]->{kids} }, $node;
			$node->{tag} = $tag;
			$node->{_dom} = $e;
			$node->{summary} = $e->getAttribute('summary');
			$node->{count} = ++$count_tables;
			my @captions = $e->getElementsByTagName('caption');
			if (@captions)
			{
				$node->{caption} = Cognition::HTMLParser::Utils::STRINGIFY($captions[0]);
			}
		}
	}

	$this->{Structure} = $rv;
	return $rv;	

} #/sub parse_doc_structure


sub html2xhtml
{
	my $this = shift;
	my $html = shift;
	
	$html =~ s/\<\!DOCTYPE[^>]*\>//i;
	
	my $t = HTML::TreeBuilder->new();
	$t->implicit_tags(1);
	$t->p_strict(1);
	$t->parse_content($html);
	$t->{'xmlns'} = 'http://www.w3.org/1999/xhtml';
	
	my $xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
		. "<!DOCTYPE html\n"
		. "\tPUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
		. "\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
		. $t->as_XML(undef, "\t", {
			'area'=>1,
			'base'=>1,
			'basefont'=>1,
			'br'=>1,
			'col'=>1,
			'frame'=>1,
			'hr'=>1,
			'img'=>1,
			'input'=>1,
			'isindex'=>1,
			'link'=>1,
			'meta'=>1,
			'param'=>1
		});
	
	return $xhtml;
} #/sub html2xhtml


sub searchClass
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $classList = $node->getAttribute('class');
		next unless (length $classList);
		
		if ($classList =~ / (^|\s) $target (\s|$) /x)
		{
			push @matches, $node;
		}
	}
	
	return @matches;
	
} #/sub searchClass


sub searchRel
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	$target =~ s/[\:\.]/\[\:\.\]/;
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $classList = $node->getAttribute('rel');
		next unless (length $classList);
		
		if ($classList =~ / (^|\s) $target (\s|$) /ix)
		{
			push @matches, $node;
		}
	}
	
	return @matches;
	
} #/sub searchRel


sub searchID
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $id   = $node->getAttribute('id') || next;
		
		if ($id eq $target)
		{
			return $node;
		}
	}
	
} #/sub searchID


1; #/package Cognition::HTMLParser


######################################################################
package Cognition::HTMLParser::Utils;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;	
}


sub STRINGIFY
{
	my $domNode   = shift;
	my $withClass = shift || undef;
	my $inClass   = shift || 0;
	
	if (defined $withClass && !$inClass && $domNode->getNodeType == ELEMENT_NODE)
	{
		my $c = $domNode->getAttribute('class');
		$inClass = 1 if ($c =~ / (^|\s) $withClass (\s|$) /x);
	}
	
	if ($inClass || !defined $withClass)
	{
		if ($domNode->getNodeType == TEXT_NODE)
		{
			return $domNode->getData;
		}
		elsif ($domNode->getNodeType == ELEMENT_NODE && $domNode->getTagName eq 'img')
		{
			return $domNode->getAttribute('alt');
		}
		elsif ($domNode->getNodeType == ELEMENT_NODE)
		{
			my @parts;
			foreach my $child ($domNode->getChildNodes)
			{
				push @parts, STRINGIFY($child, $withClass, $inClass);
			}
			return join(' ', @parts);
		}
	}
}


sub xmlns
{
	my $ns  = shift;
	my $dom = shift;
	
	while ($dom->getNodeType==ELEMENT_NODE)
	{
		if (length $dom->getAttribute("xmlns:$ns"))
		{
			return $dom->getAttribute("xmlns:$ns");
		}
		$dom = $dom->getParentNode;
	}
	
	if ($dom->getNodeType==DOCUMENT_NODE)
	{
		if (length $dom->getDocumentElement->getAttribute("xmlns:$ns"))
		{
			return $dom->getDocumentElement->getAttribute("xmlns:$ns");
		}
	}
	
	return undef;
}


1; #/package Cognition::HTMLParser::Utils


######################################################################
package Cognition::HTMLParser::Microformats;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;	
}


sub ufp_abbr
# Implements the microformats ABBR pattern.
{
	my $node = shift;
	my $page = shift;
	my $elem = shift || 'abbr';
	my $attr = shift || 'title';
	my $rv = 0;
	
	my $abbrs = $node->getElementsByTagName('*');
	for (my $i=0; $i < $abbrs->getLength; $i++)
	{
		# For each ABBR element...
		my $abbr = $abbrs->item($i);
	
		# Skip pattern if no title attribute found.
		my $title = $abbr->getAttribute($attr) || next;

		# ABBR element (or other element specified as third parameter) should
		# *always* be expanded. Other elements should only be expanded if the
		# title attribute (or other attribute, fourth parameter) includes the
		# string 'data:'.
		next
			unless ($abbr->getTagName eq $elem 
			|| $abbr->getAttribute($attr) =~ /data\:/);
		
		# Implement Andy Mabbett's suggested "data:" prefix
		if ($title =~ / [\(\[] data\: (.*) [\)\]] /x
		||  $title =~ / data\: (.*) $ /x )
		{
			$title = $1;
		}
		
		# Remove child nodes
		my $kids = $abbr->getChildNodes;
		for (my $j=0; $j < $kids->getLength; $j++)
		{
			$abbr->removeChild($kids->item($j));
		}
		
		# Insert contents of title attribute
		$abbr->addText($title) && $rv++;
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_abbr


sub ufp_include
# Implements the standard microformats include pattern.
{
	my $node = shift;
	my $page = shift;
	my $class = shift || 'include';
	my $rv = 0;
	
	my $links = $node->getElementsByTagName('a');
	for (my $i=0; $i < $links->getLength; $i++)
	{
		# For each link...
		my $link = $links->item($i);
	
		# Skip pattern if no class attribute found.
		my $classList = $link->getAttribute('class') || next;
		
		# We've found a use of the include pattern
		if ($classList =~ / (^|\s) $class (\s|$) /x)
		{
			my $href = $link->getAttribute('href');
			my $id = undef;
			if ($href =~ /^\#(.*)$/)
			{
				$id = $1;
			}
			else
			{
				next;
			}
			
			my $replacement = $page->searchID($id)->cloneNode(1) || next;
			$link->getParentNode->replaceChild($replacement, $link) && $rv++;
		}
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_include


sub ufp_include2
# Implements the alternative microformats include pattern.
{
	my $node = shift;
	my $page = shift;
	my $classpfx = shift || '#';
	my $rv = 0;
	
	# For each element...
	foreach my $elem ($node->getElementsByTagName('*'))
	{
		# Skip pattern if no class attribute found.
		my $classList = $elem->getAttribute('class');
		next unless ($classList =~ / $classpfx /x);
		
		my $atEnd = 0;
		
		$classList =~ s/(^\s|\s$)//g;
		$classList =~ s/\s+/ /g;
		my @classes = split / /, $classList;
		my @newClassList = ();
		
		foreach my $c (@classes)
		{
			if (substr($c,0,1) ne $classpfx && length($c)>1)
			{
				push @newClassList, $c;
				$atEnd = 1;
				next;
			}
			
			my $id = $c; $id =~ s/^\#//x;
			my $replacement = $page->searchID($id) || next;

			$replacement = $replacement->cloneNode(1);
			if ($atEnd)
			{
				$elem->appendChild($replacement) && $rv++;
			}
			else
			{
				$elem->insertBefore($replacement, $elem->getFirstChild) && $rv++;
			}
		}
		
		$elem->setAttribute('class', join(' ', @newClassList));
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_include2


1; #/package Cognition::HTMLParser::Microformats


######################################################################
package Cognition::HTMLParser::hCard;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 6; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('vcard', $within);
	my @rv;
	
	foreach my $card (@nodes)
	{
		my $hcard = parse($page, $card);
		push @rv, $hcard;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $hcard_node = $rv->{'_dom'}->cloneNode(1);
	
	my $id = $hcard_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($hcard_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($hcard_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($hcard_node, $page);

	# Pull out nested hcards to prevent them being parsed as within this hcard.
	# Check to see if they have an 'agent' class, and if so, parse them as an agent.
	my @nested = $page->searchClass('vcard', $hcard_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) agent (\s|$) /x)
		{
			push @{ $rv->{agent} }, parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}
	
	# Categories
	my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $hcard_node);
	$rv->{category} = \@tags unless (!@tags);
	
	# Pull out embedded addresses.
	my @adrs = Cognition::HTMLParser::adr::parse_all($page, $hcard_node);
	$rv->{adr} = \@adrs unless (!@adrs);

	# Pull out embedded locations.
	my @geos = Cognition::HTMLParser::geo::parse_all($page, $hcard_node);
	$rv->{geo} = \@geos unless (!@geos);
	
	# Parse organisations.
	my @orgs = $page->searchClass('org', $hcard_node);
	foreach my $org (@orgs)
	{
		my @orgnames = $page->searchClass('organization-name', $org);
		my @orgunits = $page->searchClass('organization-unit', $org);
		my $parsed_org = {};
		
		if (!@orgnames)
		{
			$parsed_org = { 
				'organisation-unit' => undef,
				'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($org)
			};
		}
		else
		{
			my $orgname = $orgnames[0];
			my $orgunit = $orgunits[0];
			$parsed_org = { 
				'organisation-unit' => Cognition::HTMLParser::Utils::STRINGIFY($orgname),
				'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($orgunit)
			};
		}
		
		push @{ $rv->{org} }, $parsed_org;
	}
	
	# Simple attributes - singles
	my @simples = qw(fn bday tz sort-string uid class family-name given-name
		additional-name honorific-prefix honorific-suffix);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $hcard_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
		
		if ($simple eq 'uid')
		{
			if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
				{ $value = $value_node->getAttribute('href'); }
			elsif ($value_node->getTagName eq 'img')
				{ $value = $value_node->getAttribute('src'); }
			elsif ($value_node->getTagName eq 'object')
				{ $value = $value_node->getAttribute('data'); }
		}
		
		$rv->{$simple} = $value;
	}
	
	# Simple attributes - possible duplicates
	my @simples = qw(nickname url label title role photo sound key logo mailer
		category note rev agent);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $hcard_node);
		foreach my $value_node (@value_nodes)
		{
			next unless ($value_node);
			
			my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
			
			if ($simple =~ /^(url|photo|logo|sound|key)$/)
			{
				if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
					{ $value = $value_node->getAttribute('href'); }
				elsif ($value_node->getTagName eq 'img')
					{ $value = $value_node->getAttribute('src'); }
				elsif ($value_node->getTagName eq 'object')
					{ $value = $value_node->getAttribute('data'); }
			}
			
			push @{$rv->{$simple}}, $value;
		}
	}
	
	# Telephone numbers and e-mail addresses
	my @comms = ('tel', 'email');
	foreach my $comm (@comms)
	{
		my @nodes = $page->searchClass($comm, $hcard_node);
		foreach my $n (@nodes)
		{
			my @accumValue;
			my @accumType;
			
			foreach my $v ($page->searchClass('value', $n))
			{
				if ($v->getTagName eq 'a' || $v->getTagName eq 'area')
				{
					my $href = $v->getAttribute('href');
					$href =~ s/(^[a-z]+\:|\?.*$)//g;
					push @accumValue, $href;
				}
				elsif ($v)
				{
					push @accumValue, Cognition::HTMLParser::Utils::STRINGIFY($v);
				}
			}
			
			foreach my $v ($page->searchClass('type', $n))
			{
				push @accumType, uc(Cognition::HTMLParser::Utils::STRINGIFY($v));
				$v->getParentNode->removeChild($v);
			}
			
			my $val = join '', @accumValue;
			if ($val !~ /[0-9a-z]/i)
			{
				$val = Cognition::HTMLParser::Utils::STRINGIFY($n);
			}
			
			push @{ $rv->{$comm} }, {
					'value' => $val,
					'type'  => \@accumType
				};
		}
		
	}
	
	# Check to see if this contact card is for an individual contact or an org.
	# If for an org, disable n-optimisation.
	my $do_n_opt = 1;
	$rv->{_type} = 'contact';
	foreach my $org (@{ $rv->{org} })
	{
		if (lc($org->{'organisation-name'}) eq lc($rv->{fn}))
		{
			$rv->{_type} = 'organisation';
			$do_n_opt = 0;
			last;
		}
	}

	# Perform n-optimisation.
	if ($do_n_opt)
	{
		foreach my $f (qw(family-name given-name additional-name honorific-prefix honorific-suffix))
		{
			if (length $rv->{$f})
			{
				$do_n_opt = 0;
			}
		}
		
		if ($do_n_opt)
		{
			$rv->{fn} =~ s/(^\s|\s$)//g;
			$rv->{fn} =~ s/\s+/ /g;
			
			my @words = split / /, $rv->{fn};
			
			if (scalar @words == 1)
			{
				$rv->{nickname} = $words[0];
			}
			elsif (scalar @words)
			{
				if ($words[0] =~ /^.*\,$/ || $words[1] =~ /^.\.?$/)
				{
					$words[0] =~ s/[\.\,]$//;
					$words[1] =~ s/[\.\,]$//;
					$rv->{'given-name'} = $words[1];
					$rv->{'family-name'} = $words[0];
				}
				else
				{
					$rv->{'given-name'} = $words[0];
					$rv->{'family-name'} = $words[-1];
				}
			}
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::hCard


######################################################################
package Cognition::HTMLParser::hCalendar;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('hcalendar', $within);
	my @rv;
	
	if (!@nodes && $within->getNodeType==DOCUMENT_NODE)
	{
		@nodes = ($within->getDocumentElement);
	}
	elsif (!@nodes)
	{
		@nodes = ($within);
	}
	
	foreach my $hc (@nodes)
	{
		my $hc_parsed = parse($page, $hc);
		push @rv, $hc_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $hc_node = $rv->{'_dom'}->cloneNode(1);

	my $id = $hc_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($hc_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($hc_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($hc_node, $page);
	
	# Pull out nested hevents.
	my @events = Cognition::HTMLParser::hCalendarEvent::parse_all($page, $hc_node);
	$rv->{vevent} = \@events unless (!@events);
	
	# Simple attributes - singles
	my @simples = qw(version prodid);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $hc_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
		$rv->{$simple} = $value;
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::hCalendar


######################################################################
package Cognition::HTMLParser::hCalendarEvent;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 6; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('vevent', $within);
	my @rv;
	
	foreach my $ev (@nodes)
	{
		push @rv, parse($page, $ev);
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $event_node = $rv->{'_dom'}->cloneNode(1);
	
	my $id = $event_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($event_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($event_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($event_node, $page);

	# Pull out nested hcards to prevent them being parsed as within this event.
	# Check to see if they have a 'useful' class, and if so, parse them in.
	my @nested = $page->searchClass('vcard', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) organiser (\s|$) /x)
		{
			push @{ $rv->{organiser} }, Cognition::HTMLParser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) contact (\s|$) /x)
		{
			push @{ $rv->{contact} }, Cognition::HTMLParser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) attendee (\s|$) /x)
		{
			push @{ $rv->{attendee} }, Cognition::HTMLParser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::HTMLParser::hCard::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}
	
	# Ditto for addresses
	my @nested = $page->searchClass('adr', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::HTMLParser::adr::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}

	# Ditto for geo
	my @nested = $page->searchClass('geo', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::HTMLParser::geo::parse($page, $h);
		}
		else
		{
			push @{ $rv->{geo} }, Cognition::HTMLParser::geo::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}

	# Categories
	my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $event_node);
	$rv->{category} = \@tags unless (!@tags);	
	
	# Simple attributes - singles
	my @simples = qw(class description dtend dtstart duration location status
		summary uid url last-modified rdate rrule);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $event_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);

		if ($simple =~ /^(url|uid)$/)
		{
			if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
				{ $value = $value_node->getAttribute('href'); }
			elsif ($value_node->getTagName eq 'img')
				{ $value = $value_node->getAttribute('src'); }
			elsif ($value_node->getTagName eq 'object')
				{ $value = $value_node->getAttribute('data'); }
		}
			
		$rv->{$simple} = $value;
	}	
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::hCalendarEvent


######################################################################
package Cognition::HTMLParser::adr;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('adr', $within);
	my @rv;
	
	foreach my $adr (@nodes)
	{
		my $adr_parsed = parse($page, $adr);
		push @rv, $adr_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $adr_node = $rv->{'_dom'}->cloneNode(1);

	my $id = $adr_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($adr_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($adr_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($adr_node, $page);
	
	# Simple attributes - possible duplicates
	my @simples = qw(post-office-box extended-address street-address locality
		region postal-code country-name type);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $adr_node);
		foreach my $value_node (@value_nodes)
		{
			my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
			push @{$rv->{$simple}}, $value;
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::adr


######################################################################
package Cognition::HTMLParser::geo;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('geo', $within);
	my @rv;
	
	foreach my $geo (@nodes)
	{
		my $geo_parsed = parse($page, $geo);
		push @rv, $geo_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $geo_node = $rv->{'_dom'}->cloneNode(1);

	my $id = $geo_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($geo_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($geo_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($geo_node, $page);
	
	# Simple attributes - singles
	my @simples = qw(longitude latitude body reference-frame altitude);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $geo_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
		$rv->{$simple} = $value;
	}
	
	if (!length($rv->{longitude}) || !length($rv->{latitude}))
	{
		my $str = $geo_node->toString;
		if ($geo_node->getTagName eq 'img' || $geo_node->getTagName eq 'area')
			{ $str = $geo_node->getAttribute('alt'); }
			
		$str =~ s/\,/\./g;
		if ($str =~ / \s* (\-?[0-9\.]+) \s* \; \s* (\-?[0-9\.]+) \s* /x)
		{
			$rv->{latitude}  = $1;
			$rv->{longitude} = $2;
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::geo


######################################################################
package Cognition::HTMLParser::figure;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('figure', $within);
	my @rv;
	
	foreach my $figure (@nodes)
	{
		my $figure_parsed = parse($page, $figure);
		push @rv, $figure_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $figure_node = $rv->{'_dom'}->cloneNode(1);
	$rv->{_dom} = undef;

	my $id = $figure_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::HTMLParser::Microformats::ufp_include($figure_node, $page) 
		              + Cognition::HTMLParser::Microformats::ufp_include2($figure_node, $page);
		$incl_iterations++;
	}
	Cognition::HTMLParser::Microformats::ufp_abbr($figure_node, $page);
	
	# Extract embedded hCards, which may be used as credits
	my @nested = $page->searchClass('hcard', $figure_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) credit (\s|$) /x)
			{ push @{ $rv->{credit} }, Cognition::HTMLParser::hcard::parse($page, $h); }	
		$h->getParentNode->removeChild($h);
	}

	# Additional credits	
	my @value_nodes = $page->searchClass('credit', $figure_node);
	foreach my $value_node (@value_nodes)
	{
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
		push @{ $rv->{'credit'} }, $value;
	}

	# Legend
	my @value_nodes = $page->searchClass('legend', $figure_node);
	my $value_node  = $value_nodes[0];
	if ($value_node)
	{
		my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
		$value = $value_node->getAttribute('title') if ($value_node->getTagName eq 'img');
		$rv->{'legend'} = $value;
	}

	# Categories
	my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $figure_node);
	$rv->{category} = \@tags unless (!@tags);	
	
	# Find image
	my $img;
	if ($figure_node->getTagName eq 'img')
	{
		$img = $figure_node; 
	}
	else
	{
		my @images = $figure_node->getElementsByTagName('img');
		$img = $images[0];
	}
	
	$rv->{image} = $page->abs_url($img->getAttribute('src'));
	$rv->{title} = $img->getAttribute('title') unless (length $rv->{'legend'});
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::figure


######################################################################
package Cognition::HTMLParser::Rel::Tag;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchRel('tag', $within);
	my @rv;
	
	foreach my $t (@nodes)
	{
		my $t_parsed = parse($page, $t);
		push @rv, $t_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $t_node = $rv->{'_dom'}->cloneNode(1);

	my $id = $t_node->getAttribute('id');
	$rv->{_id} = $id if (length $id);
	
	my $ts;
	my $tag = $t_node->getAttribute('href');
	$tag =~ s/\#.*$//;
	$tag =~ s/\?.*$//;
	$tag =~ s/\/$//;
	
	if ($tag =~ m{^(.*/)([^/]+)$})
	{
		$rv->{tagspace} = $1;
		$rv->{tag}      = $2;
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::Rel::Tag


######################################################################
package Cognition::HTMLParser::Rel::License;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @rv;
	
	# Unqualified 'license' rel.
	my @nodes = $page->searchRel('license', $within);
	
	# Various namespace-qualified alternatives...
	push @nodes, $page->searchRel($page->_fq2pfx('http://web.resource.org/cc/license'), $within);
	push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/license'), $within);
	push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/rights.license'), $within);
	push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/elements/1.1/rights.license'), $within);
	
	foreach my $l (@nodes)
	{
		my $l_parsed = parse($page, $l);
		push @rv, $l_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $t_node = $rv->{'_dom'}->cloneNode(1);
	$rv->{'license'} = $t_node->getAttribute('href');
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::Rel::License


######################################################################
package Cognition::HTMLParser::xoxo;
######################################################################

use XML::DOM;
use XML::XOXO;
use strict;

BEGIN
{
	our $VERSION = 0.1;
}


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @rv;
	
	my @nodes = $page->searchClass('xoxo', $within);
	foreach my $l (@nodes)
	{
		my $l_parsed = parse($page, $l);
		push @rv, $l_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $list_node = $rv->{'_dom'}->cloneNode(1);
	
	my $parser = XML::XOXO::Parser->new();
	$rv->{'object'} = $parser->parse($list_node->toString);
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::HTMLParser::xoxo


######################################################################
package main;
######################################################################

use strict;
use CGI qw(:standard);
use CGI::Carp 'fatalsToBrowser';
use Data::Dumper;
use LWP::UserAgent;
use LWP::RobotUA;
use HTTP::Request;

my $q = new CGI;

if ($q->request_method =~ /^(GET|POST|HEAD)$/)
{
	print &header;
	
	print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n";
	print "<title>cognition/0.1-alpha2</title>\n";
	print "<link href=\"cognition.css\" rel=\"stylesheet\" type=\"text/css\">\n";
	print "<h1>cognition/0.1-alpha2</h1>\n";
	print "<form method=\"get\" action=\"#\">\n";
	print " <fieldset>\n";
	print "  <legend>Analyse semantics of a page</legend>\n";
	print "  <label for=\"url\">URL:</label>\n";
	print "  <input id=\"url\" name=\"url\">\n";
	print "  <input id=\"submit\" type=\"submit\">\n";
	print " </fieldset>\n";
	print "</form>\n";
	
	my $url = $q->param('url');
	if (length $url)
	{
		print "<pre>\n";

		my $ua = LWP::RobotUA->new(
			agent      => 'cognition/0.1-alpha2 ',
			from       => 'invalid@invalid.invalid',
			delay      => 0
		);

		my $request  = HTTP::Request->new(GET => $url);
		my $response = $ua->request($request);
		my $x        = Cognition::HTMLParser->new(
				$response->content, 
				{
					ua        => $ua,
					response  => $response,
					request   => $request
				}
			);
		print escapeHTML($x->dumpTextClean);
		print "</pre>\n";
	}
	else
	{
		print "<p class=\"warn\">No URL entered!</p>\n";
	}
}

else
{
	my $ua = LWP::UserAgent->new(
		agent      => 'cognition/0.1-alpha2 '
	);

	my $url      = shift @ARGV;
	my $request  = HTTP::Request->new(GET => $url);
	my $response = $ua->request($request);
	my $x        = Cognition::HTMLParser->new(
			$response->content, 
			{
				ua        => $ua,
				response  => $response,
				request   => $request
			}
		);
	print $x->dumpTextClean;
}