#!/usr/bin/perl

#    Cognition - A metadata browser
#    Copyright (c) 2008 Toby Inkster.
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


######################################################################
package Cognition::Parser;
######################################################################

use Data::Dumper;
use HTML::Entities;
use HTML::TreeBuilder;
use HTTP::Request;
use Sort::ArrayOfArrays;
use URI::URL;
use XML::DOM;
use XML::Parser;
use XML::XPath;
use strict;


BEGIN
{
	our $VERSION = 0.10;
	our $XDP = new XML::DOM::Parser('NoLWP' => 1);
}


sub new
{
	my $self     = shift;
	my $source   = shift;
	my $opts     = shift;
	
	my $this   = {
		'UA'       => $opts->{ua},
		'REQUEST'  => $opts->{request},
		'RESPONSE' => $opts->{response}
	};
	
	$this->{RESPONSE}->header('content-type') =~ /(xml)/i;
	my $xml = (length $1) ? 1 : 0;
	
	$this->{BASE} = $this->{RESPONSE}->base();

	if ($xml)
	{
		$this->{_xml} = 1;
		$this->{_source} = $source;
	}
	else
	{
		$this->{_xml} = 0;
		$this->{_source} = $self->html2xhtml($source);
		$this->{_raw_source} = $source;
	}
	
	my $xpp = XML::Parser->new(
		ErrorContext => 2,
		# ParseParamEnt => 1,
		NoLWP => 1
	);
	$xpp->setHandlers(
		ExternEnt => 0,
		ExternEntFin => 0		
	);
	$this->{DOM} = $Cognition::Parser::XDP->parse($this->{_source});
	$this->{XP}  = XML::XPath->new(xml => $this->{_source}, parser=>$xpp);

	bless($this);

	$this->parse_metadata();
	$this->parse_uformats();
	$this->parse_doc_structure();

	return $this;
	
} #/sub new


sub dumpText
{
	my $this = shift;
	return Dumper($this);
} #/sub dumpText


sub dumpTextClean
{
	my $this = shift;
	my $tmp  = {};
	$tmp->{DOM} = $this->{DOM};  $this->{DOM} = '...hidden...';
	$tmp->{XP} = $this->{XP};  $this->{XP} = '...hidden...';
	
	$Data::Dumper::Sortkeys = \&_cleanKeys;
	my $txt = $this->dumpText;
	
	$this->{DOM} = $tmp->{DOM};
	$this->{XP} = $tmp->{XP};

	return $txt;
} #/sub dumpTextClean


sub _cleanKeys
{
	my $hash = shift;
	my @keys = sort grep {/^[a-zA-Z]/} keys %{ $hash };
	return \@keys;
}


sub quoted
{
	return '"'.(shift).'"';
} #/sub quoted


sub dumpRDF
{
	my $this    = shift;
	my $triples = $this->triples;
	
	my %lookup;
	
	my $rv = "<?xml version=\"1.0\"?>\n"
		. "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"";
		
	foreach my $ns (keys %{ $this->{_metadata_ns} })
	{
		my $nsurl = $this->{_metadata_ns}->{$ns}->{nsurl};
		next unless (length $nsurl);
		$lookup{$nsurl} = lc($ns);
		
		$rv .= "\n\txmlns:".lc($ns)."=\"".encode_entities($nsurl)."\"";
	}
	$rv .= ">\n\n";
	
	my $current_target = undef;
	for (my $i=0; defined $triples->[$i]; $i++)
	{
		my ($target, $key, $value) = @{ $triples->[$i] };
		my $stripped_target;
		
		if ($target ne $current_target)
		{
			if (defined $current_target)
			{
				$rv .= "\t</rdf:Description>\n\n";
			}
		
			$current_target = $target;
			$stripped_target = $target;
			$stripped_target =~ s/(^\<|\>$)//g;
			
			$rv .= "\t<rdf:Description rdf:about=\"".encode_entities($stripped_target)."\">\n"
		}
		
		my $tag = $this->_fq2pfx($key);
		
		if ($value =~ /^\</)
		{
			$value =~ s/(^\<|\>$)//g;
			$rv .= "\t\t<$tag rdf:resource=\"".encode_entities($value)."\" />\n";
		}
		else
		{
			$value =~ s/(^\"|\"$)//g;
			$rv .= "\t\t<$tag>".encode_entities($value)."</$tag>\n";
		}
	}

	if (defined $current_target)
	{
		$rv .= "\t</rdf:Description>\n\n";
	}
	
	$rv .= "</rdf:RDF>\n";
	
	return $rv;

} #/sub dumpRDF


sub dumpTriples
{
	my @triples = shift->triples;
	return Dumper(\@triples);
} #/sub dumpTriples


sub triples
{
	my $this  = shift;
	my @rv;
	my %RV;
	my @metas = keys %{ $this->{Meta} };
	foreach my $m (@metas)
	{
		for (my $i=0; defined $this->{Meta}->{$m}->[$i]; $i++)
		{
			my $M = $this->{Meta}->{$m}->[$i];
			my ($target, $value);
			
			$target = $M->{_target} || '';
			$target = "<$target>";
			
			if ($M->{_attr} eq 'rel')
			{
				$value = '<'.$M->{href}.'>';
				
				if ($M->{title})
				{
					$RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1;
				}
			}
			elsif ($M->{_attr} eq 'rev')
			{
				$value = '<>';
				
				if ($M->{title})
				{
					$RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1;
				}
			}
			else
			{
				$value = quoted($M->{data});
			}
			
			$RV{$target}{$m}{$value} = 1 if ($m =~ /^\</);
		}
	}
	
	# Using a hash to begin with eliminates duplicates.
	foreach my $i (keys %RV)
	{
		foreach my $j (keys %{ $RV{$i} })
		{
			foreach my $k (keys %{ %{$RV{$i}}->{$j} })
			{
				push @rv, [($i, $j, $k)];
			}
		}
	}
	
	my $sort = Sort::ArrayOfArrays->new(
		sort_column => '0,1,2',
		results     => \@rv
	);
	
	return $sort->sort_it;
} #/sub triples


sub parse_rfc2731_namespaces
# Discover namespaces for metadata conforming to RFC 2731. Note
# that eRDF uses the same format for metadata schemas, so we
# can re-use these for eRDF too.
{
	my $this = shift;
	
	# We'll start by defining some namespaces for metadata.
	# The document is free to redefine them, but it seems to
	# work better if we define these by default.
	$this->{_metadata_ns} = {
		# Some people forget to manually specify these
		'DC' => {
			nsurl => 'http://purl.org/dc/elements/1.1/',
			title => 'Dublin Core 1.1'
		},
		'DCTERMS' => {
			nsurl => 'http://purl.org/dc/terms/',
			title => 'Dublin Core Terms'
		},
		# Pseudo-schema for <meta http-equiv>.
		'HTTP11' => {
			nsurl => 'urn:ietf:rfc:2616#',
			title => 'Hypertext Transfer Protocol 1.1'
		},
		# Metaschema for RFC2731 itself
		'SCHEMA' => {
			nsurl => 'urn:ietf:rfc:2731#',
			title => "The 'Schema' Metaschema"
		},
		# Defined by Atom, not strictly RFC2731-compliant, but can be
		# treated as such. (service.feed, service.edit, service.post)
		'SERVICE' => {
			nsurl => 'http://www.w3.org/2007/app#',
			title => 'Atom Publishing Protocol'
		},
		# Defined by OpenID, not strictly RFC2731-compliant, but can be
		# treated as such. (openid.server, openid.delegate)
		'OPENID' => {
			nsurl => 'http://openid.net/specs/openid-authentication-1_1.html#',
			title => 'OpenID 1.1'
		}
	};
	
	# Find custom namespaces. FOAF and other stuff goes here.
	my $links = $this->{XP}->find('//link');
	foreach my $l ($links->get_nodelist)
	{
		my $rel = $l->getAttribute('rel');
		
		if ($rel =~ /^schema[\.\:]([A-Z_][A-Z0-9_-]*)$/i)
		{
			my $prefix = uc($1);
			my $url    = $this->abs_url($l->getAttribute('href'));
			my $title  = $l->getAttribute('title');
			
			$this->{_metadata_ns}->{$prefix}->{nsurl} = $url;
			$this->{_metadata_ns}->{$prefix}->{title} = $title || "The '$prefix' Schema";
		}
	}
	
} #/sub parse_rfc2731_namespaces


sub parse_metadata
{
	my $this = shift;
	$this->parse_rfc2731_namespaces;
	
	# Parse links 
	my $links = $this->{XP}->find('//*[@rel]');
	foreach my $l ($links->get_nodelist)
	{
		my $rel = $l->getAttribute('rel');
		$rel =~ s/(^\s+|\s+$)//g;
		$rel =~ s/\s+/ /g;
		my @rel = split / /, $rel;
		
		foreach my $r (@rel)
		{
			my $nsrel = $this->_pfx2fq($r);
			
			push @{ $this->{Meta}->{$nsrel} },
				{
					title  => $l->getAttribute('title'),
					media  => $l->getAttribute('media'),
					data   => $l->getAttribute('href'),
					href   => $this->abs_url($l->getAttribute('href')),
					type   => $l->getAttribute('type'),
					lang   => $l->getAttribute('lang'),
					_meta  => $r,
					_xmeta => $nsrel,
					_xpath => $l,
					_tag   => $l->getName,
					_attr  => 'rel'
				};
		}
	}
	my $links = $this->{XP}->find('//*[@rev]');
	foreach my $l ($links->get_nodelist)
	{
		my $rev = $l->getAttribute('rev');
		$rev =~ s/(^\s+|\s+$)//g;
		$rev =~ s/\s+/ /g;
		my @rev = split / /, $rev;
		
		foreach my $r (@rev)
		{
			my $nsrev = $this->_pfx2fq($r);
			
			push @{ $this->{Meta}->{$nsrev} },
				{
					title  => $l->getAttribute('title'),
					media  => $l->getAttribute('media'),
					data   => $l->getAttribute('href'),
					href   => $this->abs_url($l->getAttribute('href')),
					type   => $l->getAttribute('type'),
					lang   => $l->getAttribute('lang'),
					rev    => $rev,
					_xmeta => $nsrev,
					_meta  => $r,
					_xpath => $l,
					_tag   => $l->getName,
					_attr  => 'rev',
					_target=> $this->abs_url($l->getAttribute('href'))
				};
		}
	}
	
	# Parse <meta>
	my $metas = $this->{XP}->find('//meta[@name]');
	foreach my $m ($metas->get_nodelist)
	{
		my $name = $m->getAttribute('name');
		$name =~ s/(^\s+|\s+$)//g;
		$name =~ s/\s+/ /g;
		my @name = split / /, $name;
		
		foreach my $n (@name)
		{
			my $nsname = $this->_pfx2fq($n);
			
			push @{ $this->{Meta}->{$nsname} },
				{
					data   => $m->getAttribute('content'),
					lang   => $m->getAttribute('lang'),
					scheme => $this->_pfx2fq($m->getAttribute('scheme')),
					name   => $name,
					_meta  => $n,
					_xmeta => $nsname,
					_xpath => $m,
					_tag   => $m->getName,
					_attr  => 'name'
				};
		}
	}
	
	# Parse <meta http-equiv>.
	my $metas = $this->{XP}->find('//meta[@http-equiv]');
	foreach my $m ($metas->get_nodelist)
	{
		my $name = $m->getAttribute('http-equiv');
		my $nsname = $this->_pfx2fq('HTTP11.'.$name);
		
		push @{ $this->{Meta}->{$nsname} },
			{
				data   => $m->getAttribute('content'),
				scheme => $this->_pfx2fq($m->getAttribute('scheme')),
				'http-equiv' => $name,
				_meta  => $name,
				_xmeta => $nsname,
				_xpath => $m,
				_tag   => $m->getName
			};
	}
	
	my @headers = $this->{RESPONSE}->header_field_names;
	foreach my $h (@headers)
	{
		my $nsname = $this->_pfx2fq('HTTP11.'.$h);
		my @vals = $this->{RESPONSE}->header($h);
		
		foreach my $v (@vals)
		{
			push @{ $this->{Meta}->{$nsname} },
				{
					data   => $v,
					'http-equiv' => $h,
					_meta  => $h,
					_xmeta => $nsname,
					_tag   => '_http_header'
				};	
		}
	}
	
	# Parse <title>.
	my $titles = $this->{XP}->find('//title');
	foreach my $t ($titles->get_nodelist)
	{
		push @{ $this->{Meta}->{'title'} },
			{
				data   => $t->string_value,
				lang   => $t->getAttribute('lang'),
				_meta  => 'title',
				_xmeta => 'title',
				_xpath => $t,
				_tag   => $t->getName
			};
	}
	
	# Parse HTML role="" attribute, filling _element, data, url (if _element=='a'), lang
	my $elements = $this->{XP}->find('//*[@role]');
	foreach my $e ($elements->get_nodelist)
	{
		my $role = $e->getAttribute('role');
		$role =~ s/(^\s+|\s+$)//g;
		$role =~ s/\s+/ /g;
		my @role = split / /, $role;
		
		foreach my $r (@role)
		{
			my $nsrole = $this->_pfx2fq($r);
			
			push @{ $this->{Meta}->{$nsrole} },
				{
					title  => $e->getAttribute('title'),
					lang   => $e->getAttribute('lang'),
					href   => $this->abs_url($e->getAttribute('href')),
					data   => $e->string_value,
					role   => $role,
					_meta  => $r,
					_xmeta => $nsrole,
					_xpath => $e,
					_tag   => $e->getName,
					_attr  => 'role'
				};
		}
	}
	
	# Parse eRDF <URL:http://research.talis.com/2005/erdf/wiki/Main/RdfInHtml>.
	# We do not support the specialised "rdf:type" notation though.
	my $elements = $this->{XP}->find('//*[@class]');
	foreach my $e ($elements->get_nodelist)
	{
		my $role = $e->getAttribute('class');
		$role =~ s/(^\s+|\s+$)//g;
		$role =~ s/\s+/ /g;
		my @role = split / /, $role;
		
		foreach my $r (@role)
		{
			my $nsrole = $this->_pfx2fq($r, '-');
			next if ($nsrole eq lc($r)); # Only allow namespaced classes.
			
			my $parent = $e;
			my $target = undef;
			while (defined $parent && !defined $target)
			{
				if ($parent->getAttribute('id'))
				{
					$target = $parent;
				}
				else
				{
					$parent = $parent->getParentNode();
				}
			}
			
			push @{ $this->{Meta}->{$nsrole} },
				{
					title  => $e->getAttribute('title'),
					lang   => $e->getAttribute('lang'),
					href   => $this->abs_url($e->getAttribute('href')),
					data   => $e->getAttribute('title') || $e->string_value,
					class  => $role,
					_meta  => $r,
					_xmeta => $nsrole,
					_xpath => $e,
					_tag   => $e->getName,
					_attr  => 'class',
					_target=> '#' . $parent->getAttribute('id')
				};
		}
	}
	
} #/sub parse_metadata


sub _pfx2fq
{
	my $this = shift;
	my $key  = shift;
	my $mode = shift || '.:';
	my ($ns, $term, $nsurl);
	
	# In "dash mode" allows the hyphen to be used as a namespace
	# prefix separator. This is for eRDF.
	if (($mode eq '-' && $key =~ /^([A-Z_][A-Z0-9_]*)[\.\:-](.+)$/i)
	# Normally, dash is allowed as part of the namespace prefix.    
	|| ($mode eq '.:' && $key =~ /^([A-Z_][A-Z0-9_-]*)[\.\:](.+)$/i))
	{
		$ns = uc($1);
		$term = lc($2);
		$nsurl = $this->{_metadata_ns}->{$ns}->{nsurl};
		
		# In eRDF, cope with undefined prefixes by assuming that
		# the class name was never intended as eRDF.
		if ($mode eq '-' && !$nsurl)
		{
			return $key;
		}
		
		# Otherwise, if an undefined prefix is used, complain quietly.
		$nsurl = 'http://undefined-namespace-prefix.invalid/' if (!$nsurl);
		
		return '<'.$nsurl.$term.'>';
	}

	# If no valid prefix is found
	return lc($key);

} #/sub _pfx2fq


sub _fq2pfx
{
	my $this = shift;
	my $key  = shift;
	my $mode = shift || ':';

	$key =~ s/(^\<|\>$)//g;
	$key =~ /^(.*[\#\/])([^\#\/]+)$/;

	my $nsurl = $1;
	$key      = $2;
	my $pfx;

	foreach my $k (keys %{ $this->{_metadata_ns} })
	{
		if ($this->{_metadata_ns}->{$k}->{nsurl} eq $nsurl)
		{
			$pfx = lc($k);
			last;
		}
	}

	return $pfx.$mode.$key;

} #/sub _fq2pfx


sub abs_url
{
	my $this = shift;
	my $url = url shift, $this->{BASE};
	return $url->abs->as_string;
} #/sub abs_url


sub parse_uformats
{
	my $this = shift;
	my (@tmp1, @tmp2, @tmp3, @tmp4);
	
	(@tmp1 = Cognition::Parser::hCard::parse_all($this)) && ($this->{uF}->{hCard} = \@tmp1);
	(@tmp2 = Cognition::Parser::adr::parse_all($this))   && ($this->{uF}->{adr} = \@tmp2);
	(@tmp3 = Cognition::Parser::geo::parse_all($this))   && ($this->{uF}->{geo} = \@tmp3);
	(@tmp4 = Cognition::Parser::hCalendarEvent::parse_all($this)) && ($this->{uF}->{hCalendarEvent} = \@tmp4);
	
	# hAtom
	# rel-tag
	# rel-license + CC license
	# rel-enclosure

} #/sub parse_uformats


sub parse_doc_structure
{

	# Create my own pseudo-uF: list of semantic tables (i.e. having <caption> or summary)
	# Create my own pseudo-uF: list of figures (i.e. <img class="fig" title="blah">)
	# Create my own pseudo-uF: list of semantic lists (i.e. <ul class="list" title="blah">)
	# Parse heading structure, including figures, tables and lists}

} #/sub parse_doc_structure


sub html2xhtml
{
	my $this = shift;
	my $html = shift;
	
	$html =~ s/\<\!DOCTYPE[^>]*\>//i;
	
	my $t = HTML::TreeBuilder->new();
	$t->implicit_tags(1);
	$t->p_strict(1);
	$t->parse_content($html);
	$t->{'xmlns'} = 'http://www.w3.org/1999/xhtml';
	
	my $xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
		. "<!DOCTYPE html\n"
		. "\tPUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
		. "\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
		. $t->as_XML(undef, "\t", {
			'area'=>1,
			'base'=>1,
			'basefont'=>1,
			'br'=>1,
			'col'=>1,
			'frame'=>1,
			'hr'=>1,
			'img'=>1,
			'input'=>1,
			'isindex'=>1,
			'link'=>1,
			'meta'=>1,
			'param'=>1
		});
	
	return $xhtml;
} #/sub html2xhtml


sub searchClass
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $classList = $node->getAttribute('class');
		next unless (length $classList);
		
		if ($classList =~ / (^|\s) $target (\s|$) /x)
		{
			push @matches, $node;
		}
	}
	
	return @matches;
	
} #/sub searchClass


sub searchRel
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $classList = $node->getAttribute('rel');
		next unless (length $classList);
		
		if ($classList =~ / (^|\s) $target (\s|$) /x)
		{
			push @matches, $node;
		}
	}
	
	return @matches;
	
} #/sub searchRel


sub searchID
{
	my $this   = shift;
	my $target = shift;
	my $dom    = shift || $this->{DOM};
	
	my $nodeList = $dom->getElementsByTagName('*');
	my @matches = ();
	for (my $i = 0; $i < $nodeList->getLength; $i++)
	{
		my $node = $nodeList->item($i);
		my $id   = $node->getAttribute('id') || next;
		
		if ($id eq $target)
		{
			return $node;
		}
	}
	
} #/sub searchID


1; #/package Cognition::Parser


######################################################################
package Cognition::Parser::Microformats;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.10;	
}


sub STRINGIFY
{
	my $domNode   = shift;
	my $withClass = shift || undef;
	my $inClass   = shift || 0;
	
	if (defined $withClass && !$inClass && $domNode->getNodeType == ELEMENT_NODE)
	{
		my $c = $domNode->getAttribute('class');
		$inClass = 1 if ($c =~ / (^|\s) $withClass (\s|$) /x);
	}
	
	if ($inClass || !defined $withClass)
	{
		if ($domNode->getNodeType == TEXT_NODE)
		{
			return $domNode->getData;
		}
		elsif ($domNode->getNodeType == ELEMENT_NODE && $domNode->getTagName eq 'img')
		{
			return $domNode->getAttribute('alt');
		}
		elsif ($domNode->getNodeType == ELEMENT_NODE)
		{
			my @parts;
			foreach my $child ($domNode->getChildNodes)
			{
				push @parts, STRINGIFY($child, $withClass, $inClass);
			}
			return join(' ', @parts);
		}
	}
}


sub ufp_abbr
# Implements the microformats ABBR pattern.
{
	my $node = shift;
	my $page = shift;
	my $elem = shift || 'abbr';
	my $attr = shift || 'title';
	my $rv = 0;
	
	my $abbrs = $node->getElementsByTagName($elem);
	for (my $i=0; $i < $abbrs->getLength; $i++)
	{
		# For each ABBR element...
		my $abbr = $abbrs->item($i);
		
		# Skip pattern if no title attribute found.
		my $title = $abbr->getAttribute($attr) || next;
		
		# Implement Andy Mabbett's suggested "data:" prefix
		if ($title =~ / [\(\[] data\: (.*) [\)\]] /x
		||  $title =~ / data\: (.*) $ /x )
		{
			$title = $1;
		}
		
		# Remove child nodes
		my $kids = $abbr->getChildNodes;
		for (my $j=0; $j < $kids->getLength; $j++)
		{
			$abbr->removeChild($kids->item($j));
		}
		
		# Insert contents of title attribute
		$abbr->addText($title) && $rv++;
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_abbr


sub ufp_include
# Implements the standard microformats include pattern.
{
	my $node = shift;
	my $page = shift;
	my $class = shift || 'include';
	my $rv = 0;
	
	my $links = $node->getElementsByTagName('a');
	for (my $i=0; $i < $links->getLength; $i++)
	{
		# For each link...
		my $link = $links->item($i);
	
		# Skip pattern if no class attribute found.
		my $classList = $link->getAttribute('class') || next;
		
		# We've found a use of the include pattern
		if ($classList =~ / (^|\s) $class (\s|$) /x)
		{
			my $href = $link->getAttribute('href');
			my $id = undef;
			if ($href =~ /^\#(.*)$/)
			{
				$id = $1;
			}
			else
			{
				next;
			}
			
			my $replacement = $page->searchID($id)->cloneNode(1) || next;
			$link->getParentNode->replaceChild($replacement, $link) && $rv++;
		}
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_include


sub ufp_include2
# Implements the alternative microformats include pattern.
{
	my $node = shift;
	my $page = shift;
	my $classpfx = shift || '#';
	my $rv = 0;
	
	# For each element...
	foreach my $elem ($node->getElementsByTagName('*'))
	{
		# Skip pattern if no class attribute found.
		my $classList = $elem->getAttribute('class');
		next unless ($classList =~ / $classpfx /x);
		
		my $atEnd = 0;
		
		$classList =~ s/(^\s|\s$)//g;
		$classList =~ s/\s+/ /g;
		my @classes = split / /, $classList;
		my @newClassList = ();
		
		foreach my $c (@classes)
		{
			if (substr($c,0,1) ne $classpfx && length($c)>1)
			{
				push @newClassList, $c;
				$atEnd = 1;
				next;
			}
			
			my $id = $c; $id =~ s/^\#//x;
			my $replacement = $page->searchID($id) || next;

			$replacement = $replacement->cloneNode(1);
			if ($atEnd)
			{
				$elem->appendChild($replacement) && $rv++;
			}
			else
			{
				$elem->insertBefore($replacement, $elem->getFirstChild) && $rv++;
			}
		}
		
		$elem->setAttribute('class', join(' ', @newClassList));
	}
	
	# Return number of replacements made.
	return $rv;
	
} # /sub ufp_include2


1; #/package Cognition::Parser::Microformats


######################################################################
package Cognition::Parser::hCard;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.10;
}


sub MAX_INCLUDE_ITERATIONS { 6; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('vcard', $within);
	my @rv;
	
	foreach my $card (@nodes)
	{
		my $hcard = parse($page, $card);
		push @rv, $hcard;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $hcard_node = $rv->{'_dom'}->cloneNode(1);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::Parser::Microformats::ufp_include($hcard_node, $page) 
		              + Cognition::Parser::Microformats::ufp_include2($hcard_node, $page);
		$incl_iterations++;
	}
	Cognition::Parser::Microformats::ufp_abbr($hcard_node, $page);

	# Pull out nested hcards to prevent them being parsed as within this hcard.
	# Check to see if they have an 'agent' class, and if so, parse them as an agent.
	my @nested = $page->searchClass('vcard', $hcard_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) agent (\s|$) /x)
		{
			push @{ $rv->{agent} }, parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}
	
	# TODO: rel-tag for categories.
	
	# Pull out embedded addresses.
	my @adrs = Cognition::Parser::adr::parse_all($page, $hcard_node);
	$rv->{adr} = \@adrs unless (!@adrs);

	# Pull out embedded locations.
	my @geos = Cognition::Parser::geo::parse_all($page, $hcard_node);
	$rv->{geo} = \@geos unless (!@geos);
	
	# Parse organisations.
	my @orgs = $page->searchClass('org', $hcard_node);
	foreach my $org (@orgs)
	{
		my @orgnames = $page->searchClass('organization-name', $org);
		my @orgunits = $page->searchClass('organization-unit', $org);
		my $parsed_org = {};
		
		if (!@orgnames)
		{
			$parsed_org = { 
				'organisation-unit' => undef,
				'organisation-name' => Cognition::Parser::Microformats::STRINGIFY($org)
			};
		}
		else
		{
			my $orgname = $orgnames[0];
			my $orgunit = $orgunits[0];
			$parsed_org = { 
				'organisation-unit' => Cognition::Parser::Microformats::STRINGIFY($orgname),
				'organisation-name' => Cognition::Parser::Microformats::STRINGIFY($orgunit)
			};
		}
		
		push @{ $rv->{org} }, $parsed_org;
	}
	
	# Simple attributes - singles
	my @simples = qw(fn bday tz sort-string uid class family-name given-name
		additional-name honorific-prefix honorific-suffix);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $hcard_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		
		my $value = Cognition::Parser::Microformats::STRINGIFY($value_node);
		
		if ($simple eq 'uid')
		{
			if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
				{ $value = $value_node->getAttribute('href'); }
			elsif ($value_node->getTagName eq 'img')
				{ $value = $value_node->getAttribute('src'); }
			elsif ($value_node->getTagName eq 'object')
				{ $value = $value_node->getAttribute('data'); }
		}
		
		$rv->{$simple} = $value;
	}
	
	# Simple attributes - possible duplicates
	my @simples = qw(nickname url label title role photo sound key logo mailer
		category note rev agent);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $hcard_node);
		foreach my $value_node (@value_nodes)
		{
			next unless ($value_node);
			
			my $value = Cognition::Parser::Microformats::STRINGIFY($value_node);
			
			if ($simple =~ /^(url|photo|logo|sound|key)$/)
			{
				if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
					{ $value = $value_node->getAttribute('href'); }
				elsif ($value_node->getTagName eq 'img')
					{ $value = $value_node->getAttribute('src'); }
				elsif ($value_node->getTagName eq 'object')
					{ $value = $value_node->getAttribute('data'); }
			}
			
			push @{$rv->{$simple}}, $value;
		}
	}
	
	# Telephone numbers and e-mail addresses
	my @comms = ('tel', 'email');
	foreach my $comm (@comms)
	{
		my @nodes = $page->searchClass($comm, $hcard_node);
		foreach my $n (@nodes)
		{
			my @accumValue;
			my @accumType;
			
			foreach my $v ($page->searchClass('value', $n))
			{
				if ($v->getTagName eq 'a' || $v->getTagName eq 'area')
				{
					my $href = $v->getAttribute('href');
					$href =~ s/(^[a-z]+\:|\?.*$)//g;
					push @accumValue, $href;
				}
				elsif ($v)
				{
					push @accumValue, Cognition::Parser::Microformats::STRINGIFY($v);
				}
			}
			
			foreach my $v ($page->searchClass('type', $n))
			{
				push @accumType, uc(Cognition::Parser::Microformats::STRINGIFY($v));
				$v->getParentNode->removeChild($v);
			}
			
			my $val = join '', @accumValue;
			if ($val !~ /[0-9a-z]/i)
			{
				$val = Cognition::Parser::Microformats::STRINGIFY($n);
			}
			
			push @{ $rv->{$comm} }, {
					'value' => $val,
					'type'  => \@accumType
				};
		}
		
	}
	
	# Check to see if this contact card is for an individual contact or an org.
	# If for an org, disable n-optimisation.
	my $do_n_opt = 1;
	$rv->{_type} = 'contact';
	foreach my $org (@{ $rv->{org} })
	{
		if (lc($org->{'organisation-name'}) eq lc($rv->{fn}))
		{
			$rv->{_type} = 'organisation';
			$do_n_opt = 0;
			last;
		}
	}

	# Perform n-optimisation.
	if ($do_n_opt)
	{
		foreach my $f (qw(family-name given-name additional-name honorific-prefix honorific-suffix))
		{
			if (length $rv->{$f})
			{
				$do_n_opt = 0;
			}
		}
		
		if ($do_n_opt)
		{
			$rv->{fn} =~ s/(^\s|\s$)//g;
			$rv->{fn} =~ s/\s+/ /g;
			
			my @words = split / /, $rv->{fn};
			
			if (scalar @words == 1)
			{
				$rv->{nickname} = $words[0];
			}
			elsif (scalar @words)
			{
				if ($words[0] =~ /^.*\,$/ || $words[1] =~ /^.\.?$/)
				{
					$words[0] =~ s/[\.\,]$//;
					$words[1] =~ s/[\.\,]$//;
					$rv->{'given-name'} = $words[1];
					$rv->{'family-name'} = $words[0];
				}
				else
				{
					$rv->{'given-name'} = $words[0];
					$rv->{'family-name'} = $words[-1];
				}
			}
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::Parser::hCard


######################################################################
package Cognition::Parser::hCalendarEvent;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.10;
}


sub MAX_INCLUDE_ITERATIONS { 6; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('vevent', $within);
	my @rv;
	
	foreach my $ev (@nodes)
	{
		push @rv, parse($page, $ev);
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $event_node = $rv->{'_dom'}->cloneNode(1);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::Parser::Microformats::ufp_include($event_node, $page) 
		              + Cognition::Parser::Microformats::ufp_include2($event_node, $page);
		$incl_iterations++;
	}
	Cognition::Parser::Microformats::ufp_abbr($event_node, $page);

	# Pull out nested hcards to prevent them being parsed as within this event.
	# Check to see if they have a 'useful' class, and if so, parse them in.
	my @nested = $page->searchClass('vcard', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) organiser (\s|$) /x)
		{
			push @{ $rv->{organiser} }, Cognition::Parser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) contact (\s|$) /x)
		{
			push @{ $rv->{contact} }, Cognition::Parser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) attendee (\s|$) /x)
		{
			push @{ $rv->{attendee} }, Cognition::Parser::hCard::parse($page, $h);
		}
		
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::Parser::hCard::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}
	
	# Ditto for addresses
	my @nested = $page->searchClass('adr', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::Parser::adr::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}

	# Ditto for geo
	my @nested = $page->searchClass('geo', $event_node);
	foreach my $h (@nested)
	{
		if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
		{
			push @{ $rv->{location} }, Cognition::Parser::geo::parse($page, $h);
		}
		else
		{
			push @{ $rv->{geo} }, Cognition::Parser::geo::parse($page, $h);
		}
		
		$h->getParentNode->removeChild($h);
	}

	# TODO: rel-tag for categories.
	
	# Simple attributes - singles
	my @simples = qw(class description dtend dtstart duration location status
		summary uid url last-modified rdate rrule);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $event_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		
		my $value = Cognition::Parser::Microformats::STRINGIFY($value_node);

		if ($simple =~ /^(url|uid)$/)
		{
			if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
				{ $value = $value_node->getAttribute('href'); }
			elsif ($value_node->getTagName eq 'img')
				{ $value = $value_node->getAttribute('src'); }
			elsif ($value_node->getTagName eq 'object')
				{ $value = $value_node->getAttribute('data'); }
		}
			
		$rv->{$simple} = $value;
	}	
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::Parser::hCalendarEvent


######################################################################
package Cognition::Parser::adr;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.10;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('adr', $within);
	my @rv;
	
	foreach my $adr (@nodes)
	{
		my $adr_parsed = parse($page, $adr);
		push @rv, $adr_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $adr_node = $rv->{'_dom'}->cloneNode(1);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::Parser::Microformats::ufp_include($adr_node, $page) 
		              + Cognition::Parser::Microformats::ufp_include2($adr_node, $page);
		$incl_iterations++;
	}
	Cognition::Parser::Microformats::ufp_abbr($adr_node, $page);
	
	# Simple attributes - possible duplicates
	my @simples = qw(post-office-box extended-address street-address locality
		region postal-code country-name type);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $adr_node);
		foreach my $value_node (@value_nodes)
		{
			my $value = Cognition::Parser::Microformats::STRINGIFY($value_node);
			push @{$rv->{$simple}}, $value;
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::Parser::adr


######################################################################
package Cognition::Parser::geo;
######################################################################

use XML::DOM;
use strict;

BEGIN
{
	our $VERSION = 0.10;
}


sub MAX_INCLUDE_ITERATIONS { 2; }


sub parse_all
{
	my $page   = shift;
	my $within = shift || $page->{DOM};
	my @nodes = $page->searchClass('geo', $within);
	my @rv;
	
	foreach my $geo (@nodes)
	{
		my $geo_parsed = parse($page, $geo);
		push @rv, $geo_parsed;
	}
	
	return @rv;
} #/sub parse_all


sub parse
{
	my $page = shift;
	my $rv = { '_dom'=>shift };
	my $geo_node = $rv->{'_dom'}->cloneNode(1);
	
	# Expand microformat patterns.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
	{
		$replacements = Cognition::Parser::Microformats::ufp_include($geo_node, $page) 
		              + Cognition::Parser::Microformats::ufp_include2($geo_node, $page);
		$incl_iterations++;
	}
	Cognition::Parser::Microformats::ufp_abbr($geo_node, $page);
	
	# Simple attributes - singles
	my @simples = qw(longitude latitude body reference-frame altitude);	
	foreach my $simple (@simples)
	{
		my @value_nodes = $page->searchClass($simple, $geo_node);
		my $value_node  = $value_nodes[0];
		next unless ($value_node);
		my $value = Cognition::Parser::Microformats::STRINGIFY($value_node);
		$rv->{$simple} = $value;
	}
	
	if (!length($rv->{longitude}) || !length($rv->{latitude}))
	{
		my $str = $geo_node->toString;
		if ($geo_node->getTagName eq 'img' || $geo_node->getTagName eq 'area')
			{ $str = $geo_node->getAttribute('alt'); }
			
		$str =~ s/\,/\./g;
		if ($str =~ / \s* (\-?[0-9\.]+) \s* \; \s* (\-?[0-9\.]+) \s* /x)
		{
			$rv->{latitude}  = $1;
			$rv->{longitude} = $2;
		}
	}
	
	bless $rv;
	return $rv;
	
} #/sub parse


1; #/package Cognition::Parser::adr


######################################################################
package main;
######################################################################

use strict;
use Data::Dumper;
use HTTP::Request;

my $url      = shift @ARGV;

my $request  = HTTP::Request->new(GET => $url);
my $ua       = LWP::UserAgent->new;
my $response = $ua->request($request);
my $x        = Cognition::Parser->new(
		$response->content, 
		{
			ua        => $ua,
			response  => $response,
			request   => $request
		}
	);
print $x->dumpTextClean;