#!/usr/bin/perl
######################################################################
# Cognition/0.1-alpha2 - A metadata browser
# Copyright (c) 2008 Toby Inkster.
######################################################################
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#
######################################################################
######################################################################
# Version History
######################################################################
#
# cognition/0.1-alpha1 :-
#
# - initial release
# - metadata: , ,
, @role, eRDF
# - eRDF does not support rdf:type syntax
# - RFC 2731 is supported for namespaces
# - microformats: hcard, hcalendar, adr, geo
# - hcalendar support assumes page is one giant calendar
# - no support for rel-tag, so no support for categories in hcard or
# hcalendar
# - geo support includes body, altitiude and reference-frame extensions
# - microformats patterns: include-pattern, abbr-pattern, extensions
# - include-pattern supports my alternative syntax
# - abbr-pattern supports Andy Mabbett's alternative
# - RDF output of namespaced metadata
#
# cognition/0.1-alpha2 :-
#
# - drop usage of XML::XPath module, using XML::DOM instead
# - might use XML::DOM::XPath in future if XPath support is needed
# - support XML namespaces used as metadata namespaces.
# - microformats: hcalendar (complete), rel-tag, rel-license, figure, xoxo
# - rel-licence extended to support searches for 'license' in CC or
# DCTERMS namespaces; or 'rights.license' in DC or DCTERMS namespaces
# - experimental figure microformat based on current brainstorming
# - parse document structure (headings + semantic tables + semantic
# images/figures microformat? + xoxo lists)
#
# future work???
#
# - add support for Person objects (hcard) to RDF output
# - add support for Event objects to RDF output
# - use to wrap multiple tuples with the same subject and property
# - microformats: hatom, hreview?
# - support rel=meta to retrieve attitional document metadata, and parse RDF!
# - RDFa?
#
######################################################################
######################################################################
package Cognition::HTMLParser;
######################################################################
use Data::Dumper;
use HTML::Entities;
use HTML::TreeBuilder;
use HTTP::Request;
use Sort::ArrayOfArrays;
use URI::URL;
use XML::DOM;
use XML::Parser;
#use XML::XPath;
use strict;
BEGIN
{
our $VERSION = 0.1;
our $XDP = new XML::DOM::Parser('NoLWP' => 1);
$Data::Dumper::Indent = 1;
}
sub new
{
my $self = shift;
my $source = shift;
my $opts = shift;
my $this = {
'UA' => $opts->{ua},
'REQUEST' => $opts->{request},
'RESPONSE' => $opts->{response}
};
$this->{RESPONSE}->header('content-type') =~ /(xml)/i;
my $xml = (length $1) ? 1 : 0;
$this->{BASE} = $this->{RESPONSE}->base();
if (0 && $xml)
{
$this->{_xml} = 1;
$this->{_source} = $source;
}
else
{
$this->{_xml} = 0;
$this->{_source} = $self->html2xhtml($source);
$this->{_raw_source} = $source;
}
#my $xpp = XML::Parser->new(
# ErrorContext => 2,
# # ParseParamEnt => 1,
# NoLWP => 1
#);
#$xpp->setHandlers(
# ExternEnt => 0,
# ExternEntFin => 0
#);
#$this->{XP} = XML::XPath->new(xml => $this->{_source}, parser=>$xpp);
$this->{DOM} = $Cognition::HTMLParser::XDP->parse($this->{_source});
bless($this);
$this->parse_metadata();
$this->parse_uformats();
$this->parse_doc_structure();
return $this;
} #/sub new
sub dumpText
{
my $this = shift;
return Dumper($this);
} #/sub dumpText
sub dumpTextClean
{
my $this = shift;
my $tmp = {};
$tmp->{DOM} = $this->{DOM}; $this->{DOM} = '...hidden...';
# $tmp->{XP} = $this->{XP}; $this->{XP} = '...hidden...';
$Data::Dumper::Sortkeys = \&_cleanKeys;
my $txt = $this->dumpText;
$this->{DOM} = $tmp->{DOM};
# $this->{XP} = $tmp->{XP};
return $txt;
} #/sub dumpTextClean
sub _cleanKeys
{
my $hash = shift;
my @keys = sort grep {/^(_subject|_id|[^_])/} keys %{ $hash };
return \@keys;
}
sub quoted
{
return '"'.(shift).'"';
} #/sub quoted
sub dumpRDF
{
my $this = shift;
my $triples = $this->triples;
my %lookup;
my $rv = "\n"
. "{metadata_ns} })
{
my $nsurl = $this->{metadata_ns}->{$ns}->{nsurl};
next unless (length $nsurl);
$lookup{$nsurl} = lc($ns);
$rv .= "\n\txmlns:".lc($ns)."=\"".encode_entities($nsurl)."\"";
}
$rv .= ">\n\n";
my $current_subject = undef;
for (my $i=0; defined $triples->[$i]; $i++)
{
my ($subject, $key, $value) = @{ $triples->[$i] };
my $stripped_subject;
if ($subject ne $current_subject)
{
if (defined $current_subject)
{
$rv .= "\t\n\n";
}
$current_subject = $subject;
$stripped_subject = $subject;
$stripped_subject =~ s/(^\<|\>$)//g;
$rv .= "\t\n"
}
my $tag = $this->_fq2pfx($key);
if ($value =~ /^\)
{
$value =~ s/(^\<|\>$)//g;
$rv .= "\t\t<$tag rdf:resource=\"".encode_entities($value)."\" />\n";
}
else
{
$value =~ s/(^\"|\"$)//g;
$rv .= "\t\t<$tag>".encode_entities($value)."$tag>\n";
}
}
if (defined $current_subject)
{
$rv .= "\t\n\n";
}
$rv .= "\n";
return $rv;
} #/sub dumpRDF
sub dumpTriples
{
my @triples = shift->triples;
return Dumper(\@triples);
} #/sub dumpTriples
sub triples
{
my $this = shift;
my @rv;
my %RV;
my @metas = keys %{ $this->{Meta} };
foreach my $m (@metas)
{
for (my $i=0; defined $this->{Meta}->{$m}->[$i]; $i++)
{
my $M = $this->{Meta}->{$m}->[$i];
my ($subject, $value);
$subject = $M->{_subject} || '';
$subject = "<$subject>";
if ($M->{_attr} eq 'rel')
{
$value = '<'.$M->{href}.'>';
if ($M->{title})
{
$RV{$value}{''}{quoted($M->{title})} = 1;
}
}
elsif ($M->{_attr} eq 'rev')
{
$value = '<>';
if ($M->{title})
{
$RV{$value}{''}{quoted($M->{title})} = 1;
}
}
else
{
$value = quoted($M->{data});
}
$RV{$subject}{$m}{$value} = 1 if ($m =~ /^\);
}
}
# Using a hash to begin with eliminates duplicates.
foreach my $i (keys %RV)
{
foreach my $j (keys %{ $RV{$i} })
{
foreach my $k (keys %{ %{$RV{$i}}->{$j} })
{
push @rv, [($i, $j, $k)];
}
}
}
my $sort = Sort::ArrayOfArrays->new(
sort_column => '0,1,2',
results => \@rv
);
return $sort->sort_it;
} #/sub triples
sub parse_rfc2731_namespaces
# Discover namespaces for metadata conforming to RFC 2731. Note
# that eRDF uses the same format for metadata schemas, so we
# can re-use these for eRDF too.
{
my $this = shift;
# We'll start by defining some namespaces for metadata.
# The document is free to redefine them, but it seems to
# work better if we define these by default.
$this->{metadata_ns} = {
# Some people forget to manually specify these
'DC' => {
nsurl => 'http://purl.org/dc/elements/1.1/',
title => 'Dublin Core 1.1'
},
'DCTERMS' => {
nsurl => 'http://purl.org/dc/terms/',
title => 'Dublin Core Terms'
},
# Pseudo-schema for .
'HTTP11' => {
nsurl => 'urn:ietf:rfc:2616#',
title => 'Hypertext Transfer Protocol 1.1'
},
# Metaschema for RFC2731 itself
'SCHEMA' => {
nsurl => 'urn:ietf:rfc:2731#',
title => "The 'Schema' Metaschema"
},
# Defined by Atom, not strictly RFC2731-compliant, but can be
# treated as such. (service.feed, service.edit, service.post)
'SERVICE' => {
nsurl => 'http://www.w3.org/2007/app#',
title => 'Atom Publishing Protocol'
},
# Defined by OpenID, not strictly RFC2731-compliant, but can be
# treated as such. (openid.server, openid.delegate)
'OPENID' => {
nsurl => 'http://openid.net/specs/openid-authentication-1_1.html#',
title => 'OpenID 1.1'
}
};
foreach my $a ($this->{DOM}->getDocumentElement->getAttributes->getValues)
{
my $k = $a->getName;
my $v = $a->getValue;
next unless $k =~ /^xmlns\:/i;
$k =~ s/^xmlns\://i;
$this->{metadata_ns}->{$k} = {
nsurl => $v,
title => "The '$k' Namespace",
xmlns => 1
};
$this->{xmlns}->{$k} = {
nsurl => $v,
title => "The '$k' Namespace"
};
}
# Find custom RFC 2731 namespaces. FOAF and other stuff goes here.
foreach my $l ($this->{DOM}->getElementsByTagName('link'))
{
my $rel = $l->getAttribute('rel');
if ($rel =~ /^schema[\.\:]([A-Z_][A-Z0-9_-]*)$/i)
{
my $prefix = uc($1);
my $url = $this->abs_url($l->getAttribute('href'));
my $title = $l->getAttribute('title');
$this->{metadata_ns}->{$prefix}->{nsurl} = $url;
$this->{metadata_ns}->{$prefix}->{title} = $title || "The '$prefix' Schema";
}
}
} #/sub parse_rfc2731_namespaces
sub parse_metadata
{
my $this = shift;
$this->parse_rfc2731_namespaces;
# Parse links
my @links = $this->{DOM}->getElementsByTagName('link');
my @anchors = $this->{DOM}->getElementsByTagName('a');
push @links, @anchors;
foreach my $l (@links)
{
my $rel = $l->getAttribute('rel');
if (length $rel)
{
$rel =~ s/(^\s+|\s+$)//g;
$rel =~ s/\s+/ /g;
my @rel = split / /, $rel;
foreach my $r (@rel)
{
my $nsrel = $this->_pfx2fq($r, undef, $l);
push @{ $this->{Meta}->{$nsrel} },
{
title => $l->getAttribute('title'),
media => $l->getAttribute('media'),
data => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'),
href => $this->abs_url($l->getAttribute('href')),
type => $l->getAttribute('type'),
lang => $l->getAttribute('lang'),
_meta => $r,
_xmeta => $nsrel,
_node => $l,
_tag => $l->getTagName,
_attr => 'rel'
};
}
}
my $rev = $l->getAttribute('rev');
if (length $rev)
{
$rev =~ s/(^\s+|\s+$)//g;
$rev =~ s/\s+/ /g;
my @rev = split / /, $rev;
foreach my $r (@rev)
{
my $nsrev = $this->_pfx2fq($r, undef, $l);
push @{ $this->{Meta}->{$nsrev} },
{
title => $l->getAttribute('title'),
media => $l->getAttribute('media'),
data => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'),
href => $this->abs_url($l->getAttribute('href')),
type => $l->getAttribute('type'),
lang => $l->getAttribute('lang'),
rev => $rev,
_xmeta => $nsrev,
_meta => $r,
_node => $l,
_tag => $l->getTagName,
_attr => 'rev',
_subject=> $this->abs_url($l->getAttribute('href'))
};
}
}
}
# Parse
foreach my $m ($this->{DOM}->getElementsByTagName('meta'))
{
my $name = $m->getAttribute('http-equiv');
if (length $name)
{
my $nsname = $this->_pfx2fq('HTTP11.'.$name);
push @{ $this->{Meta}->{$nsname} },
{
data => $m->getAttribute('content'),
scheme => $this->_pfx2fq($m->getAttribute('scheme')),
'http-equiv' => $name,
_meta => $name,
_xmeta => $nsname,
_node => $m,
_tag => $m->getTagName,
_attr => 'http-equiv'
};
}
else
{
$name = $m->getAttribute('name');
next unless (length $name);
$name =~ s/(^\s+|\s+$)//g;
$name =~ s/\s+/ /g;
my @name = split / /, $name;
foreach my $n (@name)
{
my $nsname = $this->_pfx2fq($n, undef, $m);
push @{ $this->{Meta}->{$nsname} },
{
data => $m->getAttribute('content'),
lang => $m->getAttribute('lang'),
scheme => $this->_pfx2fq($m->getAttribute('scheme')),
name => $name,
_meta => $n,
_xmeta => $nsname,
_node => $m,
_tag => $m->getTagName,
_attr => 'name'
};
}
}
}
# Parse HTTP headers.
my @headers = $this->{RESPONSE}->header_field_names;
foreach my $h (@headers)
{
my $nsname = $this->_pfx2fq('HTTP11.'.$h);
my @vals = $this->{RESPONSE}->header($h);
foreach my $v (@vals)
{
push @{ $this->{Meta}->{$nsname} },
{
data => $v,
'http-equiv' => $h,
_meta => $h,
_xmeta => $nsname,
_tag => '_http_header'
};
}
}
# Parse .
foreach my $t ($this->{DOM}->getElementsByTagName('title'))
{
push @{ $this->{Meta}->{'title'} },
{
data => Cognition::HTMLParser::Utils::STRINGIFY($t),
lang => $t->getAttribute('lang'),
_meta => 'title',
_xmeta => 'title',
_node => $t,
_tag => $t->getTagName
};
}
# Parse HTML role="" attribute, filling _element, data, url (if _element=='a'), lang
# Parse eRDF .
# We do not support the specialised "rdf:type" notation though.
foreach my $e ($this->{DOM}->getElementsByTagName('*'))
{
my $role = $e->getAttribute('role');
if (length $role)
{
$role =~ s/(^\s+|\s+$)//g;
$role =~ s/\s+/ /g;
my @role = split / /, $role;
foreach my $r (@role)
{
my $nsrole = $this->_pfx2fq($r, undef, $e);
push @{ $this->{Meta}->{$nsrole} },
{
title => $e->getAttribute('title'),
lang => $e->getAttribute('lang'),
href => $this->abs_url($e->getAttribute('href')),
data => Cognition::HTMLParser::Utils::STRINGIFY($e),
role => $role,
_meta => $r,
_xmeta => $nsrole,
_node => $e,
_tag => $e->getTagName,
_attr => 'role'
};
}
}
$role = $e->getAttribute('class');
if (length($role))
{
$role =~ s/(^\s+|\s+$)//g;
$role =~ s/\s+/ /g;
my @role = split / /, $role;
foreach my $r (@role)
{
my $nsrole = $this->_pfx2fq($r, '-', $e);
next if ($nsrole eq lc($r)); # Only allow namespaced classes.
my $parent = $e;
my $subject = undef;
while (!defined $subject && $parent->getNodeType==ELEMENT_NODE)
{
if ($parent->getAttribute('id'))
{
$subject = $parent;
}
else
{
$parent = $parent->getParentNode();
}
}
push @{ $this->{Meta}->{$nsrole} },
{
title => $e->getAttribute('title'),
lang => $e->getAttribute('lang'),
href => $this->abs_url($e->getAttribute('href')),
data => $e->getAttribute('title') || Cognition::HTMLParser::Utils::STRINGIFY($e),
class => $role,
_meta => $r,
_xmeta => $nsrole,
_node => $e,
_tag => $e->getTagName,
_attr => 'class',
_subject=> $parent->getNodeType==ELEMENT_NODE ? ('#' . $parent->getAttribute('id')) : ''
};
}
}
}
} #/sub parse_metadata
sub _pfx2fq
{
my $this = shift;
my $key = shift;
my $mode = shift;
my $dom = shift;
my ($ns, $term, $nsurl);
$mode = '.:' unless length $mode;
# In "dash mode" allows the hyphen to be used as a namespace
# prefix separator. This is for eRDF.
if (($mode eq '-' && $key =~ /^([A-Z_][A-Z0-9_]*)[\.\:-](.+)$/i)
# Normally, dash is allowed as part of the namespace prefix.
|| ($mode eq '.:' && $key =~ /^([A-Z_][A-Z0-9_-]*)[\.\:](.+)$/i))
{
$ns = $1;
$term = $2;
$nsurl = $this->{metadata_ns}->{uc($ns)}->{nsurl};
# If we haven't found the namespace yet, then check xmlns:* attributes.
if ($dom && !length $nsurl)
{
$nsurl = Cognition::HTMLParser::Utils::xmlns($ns, $dom);
$this->{xmlns}->{$ns} = {
nsurl => $nsurl,
xmlns => 1
};
}
# In eRDF, cope with undefined prefixes by assuming that
# the class name was never intended as eRDF.
if ($mode eq '-' && !$nsurl)
{
return $key;
}
# Otherwise, if an undefined prefix is used, complain quietly.
$nsurl = 'http://undefined-namespace-prefix.invalid/' if (!$nsurl);
return '<'.$nsurl.$term.'>';
}
# If no valid prefix is found, treat as case-insensitive default namespace
return lc($key);
} #/sub _pfx2fq
sub _fq2pfx
{
my $this = shift;
my $key = shift;
my $mode = shift || ':';
$key =~ s/(^\<|\>$)//g;
$key =~ /^(.*[\#\/])([^\#\/]+)$/;
my $nsurl = $1;
$key = $2;
my $pfx;
foreach my $k (keys %{ $this->{metadata_ns} })
{
if ($this->{metadata_ns}->{$k}->{nsurl} eq $nsurl)
{
$pfx = lc($k);
return $pfx.$mode.$key;
}
}
foreach my $k (keys %{ $this->{xmlns} })
{
if ($this->{xmlns}->{$k}->{nsurl} eq $nsurl)
{
$pfx = $k;
return $pfx.$mode.$key;
}
}
return $key;
} #/sub _fq2pfx
sub abs_url
{
my $this = shift;
my $url = url shift, $this->{BASE};
return $url->abs->as_string;
} #/sub abs_url
sub parse_uformats
{
my $this = shift;
my (@tmp1, @tmp2, @tmp3, @tmp4, @tmp5, @tmp6);
(@tmp1 = Cognition::HTMLParser::hCard::parse_all($this)) && ($this->{uF}->{hCard} = \@tmp1);
(@tmp2 = Cognition::HTMLParser::adr::parse_all($this)) && ($this->{uF}->{adr} = \@tmp2);
(@tmp3 = Cognition::HTMLParser::geo::parse_all($this)) && ($this->{uF}->{geo} = \@tmp3);
(@tmp4 = Cognition::HTMLParser::Rel::Tag::parse_all($this)) && ($this->{uF}->{Tag} = \@tmp4);
(@tmp5 = Cognition::HTMLParser::hCalendar::parse_all($this)) && ($this->{uF}->{hCalendar} = \@tmp5);
(@tmp6 = Cognition::HTMLParser::Rel::License::parse_all($this)) && ($this->{uF}->{License} = \@tmp6);
# hAtom
# rel-license + CC license
# rel-enclosure
} #/sub parse_uformats
sub parse_doc_structure
{
my $this = shift;
my $rv = { kids=>() };
my $lvl = 1;
my $ptr = $rv;
my $count_lists;
my $count_tables;
my $count_figures;
# Create my own pseudo-uF: list of semantic tables (i.e. having
or summary)
# Create my own pseudo-uF: list of figures (i.e. )
# Create my own pseudo-uF: list of semantic lists (i.e.
)
# Parse heading structure, including figures, tables and lists}
foreach my $e ($this->{DOM}->getElementsByTagName('*'))
{
my $tag = $e->getTagName;
if ($tag eq 'figure' || $e->getAttribute('class') =~ / (^|\s) figure (\s|$) /x)
{
my $node = {};
push @{ $ptr->{kids}->[-1]->{kids} }, $node;
$node->{tag} = 'figure';
$node->{count} = ++$count_figures;
$node->{figure} = Cognition::HTMLParser::figure::parse($this, $e);
$node->{_dom} = $e;
}
elsif ($e->getAttribute('class') =~ / (^|\s) xoxo (\s|$) /x)
{
my $node = {};
push @{ $ptr->{kids}->[-1]->{kids} }, $node;
$node->{tag} = 'xoxo';
$node->{count} = ++$count_lists;
$node->{figure} = Cognition::HTMLParser::xoxo::parse($this, $e);
$node->{_dom} = $e;
}
elsif ($tag =~ /^h([1-6])$/i)
{
my $thisLevel = $1;
my $node = {};
while ($thisLevel > $lvl)
{
my $parent = $ptr->{kids}->[-1];
$parent->{up} = $ptr;
$ptr = $parent;
$lvl++;
}
while ($thisLevel < $lvl)
{
$lvl--;
$ptr = $ptr->{up};
}
if ($thisLevel == $lvl)
{
push @{ $ptr->{kids} }, $node;
}
$node->{tag} = $tag;
$node->{_dom} = $e;
$node->{heading} = Cognition::HTMLParser::Utils::STRINGIFY($e);
}
elsif ( $tag eq 'table' && (length $e->getAttribute('summary') || defined $e->getElementsByTagName('caption')->item(0)) )
{
my $node = {};
push @{ $ptr->{kids}->[-1]->{kids} }, $node;
$node->{tag} = $tag;
$node->{_dom} = $e;
$node->{summary} = $e->getAttribute('summary');
$node->{count} = ++$count_tables;
my @captions = $e->getElementsByTagName('caption');
if (@captions)
{
$node->{caption} = Cognition::HTMLParser::Utils::STRINGIFY($captions[0]);
}
}
}
$this->{Structure} = $rv;
return $rv;
} #/sub parse_doc_structure
sub html2xhtml
{
my $this = shift;
my $html = shift;
$html =~ s/\<\!DOCTYPE[^>]*\>//i;
my $t = HTML::TreeBuilder->new();
$t->implicit_tags(1);
$t->p_strict(1);
$t->parse_content($html);
$t->{'xmlns'} = 'http://www.w3.org/1999/xhtml';
my $xhtml = "\n"
. "\n"
. $t->as_XML(undef, "\t", {
'area'=>1,
'base'=>1,
'basefont'=>1,
'br'=>1,
'col'=>1,
'frame'=>1,
'hr'=>1,
'img'=>1,
'input'=>1,
'isindex'=>1,
'link'=>1,
'meta'=>1,
'param'=>1
});
return $xhtml;
} #/sub html2xhtml
sub searchClass
{
my $this = shift;
my $target = shift;
my $dom = shift || $this->{DOM};
my $nodeList = $dom->getElementsByTagName('*');
my @matches = ();
for (my $i = 0; $i < $nodeList->getLength; $i++)
{
my $node = $nodeList->item($i);
my $classList = $node->getAttribute('class');
next unless (length $classList);
if ($classList =~ / (^|\s) $target (\s|$) /x)
{
push @matches, $node;
}
}
return @matches;
} #/sub searchClass
sub searchRel
{
my $this = shift;
my $target = shift;
my $dom = shift || $this->{DOM};
$target =~ s/[\:\.]/\[\:\.\]/;
my $nodeList = $dom->getElementsByTagName('*');
my @matches = ();
for (my $i = 0; $i < $nodeList->getLength; $i++)
{
my $node = $nodeList->item($i);
my $classList = $node->getAttribute('rel');
next unless (length $classList);
if ($classList =~ / (^|\s) $target (\s|$) /ix)
{
push @matches, $node;
}
}
return @matches;
} #/sub searchRel
sub searchID
{
my $this = shift;
my $target = shift;
my $dom = shift || $this->{DOM};
my $nodeList = $dom->getElementsByTagName('*');
my @matches = ();
for (my $i = 0; $i < $nodeList->getLength; $i++)
{
my $node = $nodeList->item($i);
my $id = $node->getAttribute('id') || next;
if ($id eq $target)
{
return $node;
}
}
} #/sub searchID
1; #/package Cognition::HTMLParser
######################################################################
package Cognition::HTMLParser::Utils;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub STRINGIFY
{
my $domNode = shift;
my $withClass = shift || undef;
my $inClass = shift || 0;
if (defined $withClass && !$inClass && $domNode->getNodeType == ELEMENT_NODE)
{
my $c = $domNode->getAttribute('class');
$inClass = 1 if ($c =~ / (^|\s) $withClass (\s|$) /x);
}
if ($inClass || !defined $withClass)
{
if ($domNode->getNodeType == TEXT_NODE)
{
return $domNode->getData;
}
elsif ($domNode->getNodeType == ELEMENT_NODE && $domNode->getTagName eq 'img')
{
return $domNode->getAttribute('alt');
}
elsif ($domNode->getNodeType == ELEMENT_NODE)
{
my @parts;
foreach my $child ($domNode->getChildNodes)
{
push @parts, STRINGIFY($child, $withClass, $inClass);
}
return join(' ', @parts);
}
}
}
sub xmlns
{
my $ns = shift;
my $dom = shift;
while ($dom->getNodeType==ELEMENT_NODE)
{
if (length $dom->getAttribute("xmlns:$ns"))
{
return $dom->getAttribute("xmlns:$ns");
}
$dom = $dom->getParentNode;
}
if ($dom->getNodeType==DOCUMENT_NODE)
{
if (length $dom->getDocumentElement->getAttribute("xmlns:$ns"))
{
return $dom->getDocumentElement->getAttribute("xmlns:$ns");
}
}
return undef;
}
1; #/package Cognition::HTMLParser::Utils
######################################################################
package Cognition::HTMLParser::Microformats;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub ufp_abbr
# Implements the microformats ABBR pattern.
{
my $node = shift;
my $page = shift;
my $elem = shift || 'abbr';
my $attr = shift || 'title';
my $rv = 0;
my $abbrs = $node->getElementsByTagName('*');
for (my $i=0; $i < $abbrs->getLength; $i++)
{
# For each ABBR element...
my $abbr = $abbrs->item($i);
# Skip pattern if no title attribute found.
my $title = $abbr->getAttribute($attr) || next;
# ABBR element (or other element specified as third parameter) should
# *always* be expanded. Other elements should only be expanded if the
# title attribute (or other attribute, fourth parameter) includes the
# string 'data:'.
next
unless ($abbr->getTagName eq $elem
|| $abbr->getAttribute($attr) =~ /data\:/);
# Implement Andy Mabbett's suggested "data:" prefix
if ($title =~ / [\(\[] data\: (.*) [\)\]] /x
|| $title =~ / data\: (.*) $ /x )
{
$title = $1;
}
# Remove child nodes
my $kids = $abbr->getChildNodes;
for (my $j=0; $j < $kids->getLength; $j++)
{
$abbr->removeChild($kids->item($j));
}
# Insert contents of title attribute
$abbr->addText($title) && $rv++;
}
# Return number of replacements made.
return $rv;
} # /sub ufp_abbr
sub ufp_include
# Implements the standard microformats include pattern.
{
my $node = shift;
my $page = shift;
my $class = shift || 'include';
my $rv = 0;
my $links = $node->getElementsByTagName('a');
for (my $i=0; $i < $links->getLength; $i++)
{
# For each link...
my $link = $links->item($i);
# Skip pattern if no class attribute found.
my $classList = $link->getAttribute('class') || next;
# We've found a use of the include pattern
if ($classList =~ / (^|\s) $class (\s|$) /x)
{
my $href = $link->getAttribute('href');
my $id = undef;
if ($href =~ /^\#(.*)$/)
{
$id = $1;
}
else
{
next;
}
my $replacement = $page->searchID($id)->cloneNode(1) || next;
$link->getParentNode->replaceChild($replacement, $link) && $rv++;
}
}
# Return number of replacements made.
return $rv;
} # /sub ufp_include
sub ufp_include2
# Implements the alternative microformats include pattern.
{
my $node = shift;
my $page = shift;
my $classpfx = shift || '#';
my $rv = 0;
# For each element...
foreach my $elem ($node->getElementsByTagName('*'))
{
# Skip pattern if no class attribute found.
my $classList = $elem->getAttribute('class');
next unless ($classList =~ / $classpfx /x);
my $atEnd = 0;
$classList =~ s/(^\s|\s$)//g;
$classList =~ s/\s+/ /g;
my @classes = split / /, $classList;
my @newClassList = ();
foreach my $c (@classes)
{
if (substr($c,0,1) ne $classpfx && length($c)>1)
{
push @newClassList, $c;
$atEnd = 1;
next;
}
my $id = $c; $id =~ s/^\#//x;
my $replacement = $page->searchID($id) || next;
$replacement = $replacement->cloneNode(1);
if ($atEnd)
{
$elem->appendChild($replacement) && $rv++;
}
else
{
$elem->insertBefore($replacement, $elem->getFirstChild) && $rv++;
}
}
$elem->setAttribute('class', join(' ', @newClassList));
}
# Return number of replacements made.
return $rv;
} # /sub ufp_include2
1; #/package Cognition::HTMLParser::Microformats
######################################################################
package Cognition::HTMLParser::hCard;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 6; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('vcard', $within);
my @rv;
foreach my $card (@nodes)
{
my $hcard = parse($page, $card);
push @rv, $hcard;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $hcard_node = $rv->{'_dom'}->cloneNode(1);
my $id = $hcard_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($hcard_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($hcard_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($hcard_node, $page);
# Pull out nested hcards to prevent them being parsed as within this hcard.
# Check to see if they have an 'agent' class, and if so, parse them as an agent.
my @nested = $page->searchClass('vcard', $hcard_node);
foreach my $h (@nested)
{
if ($h->getAttribute('class') =~ / (^|\s) agent (\s|$) /x)
{
push @{ $rv->{agent} }, parse($page, $h);
}
$h->getParentNode->removeChild($h);
}
# Categories
my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $hcard_node);
$rv->{category} = \@tags unless (!@tags);
# Pull out embedded addresses.
my @adrs = Cognition::HTMLParser::adr::parse_all($page, $hcard_node);
$rv->{adr} = \@adrs unless (!@adrs);
# Pull out embedded locations.
my @geos = Cognition::HTMLParser::geo::parse_all($page, $hcard_node);
$rv->{geo} = \@geos unless (!@geos);
# Parse organisations.
my @orgs = $page->searchClass('org', $hcard_node);
foreach my $org (@orgs)
{
my @orgnames = $page->searchClass('organization-name', $org);
my @orgunits = $page->searchClass('organization-unit', $org);
my $parsed_org = {};
if (!@orgnames)
{
$parsed_org = {
'organisation-unit' => undef,
'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($org)
};
}
else
{
my $orgname = $orgnames[0];
my $orgunit = $orgunits[0];
$parsed_org = {
'organisation-unit' => Cognition::HTMLParser::Utils::STRINGIFY($orgname),
'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($orgunit)
};
}
push @{ $rv->{org} }, $parsed_org;
}
# Simple attributes - singles
my @simples = qw(fn bday tz sort-string uid class family-name given-name
additional-name honorific-prefix honorific-suffix);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $hcard_node);
my $value_node = $value_nodes[0];
next unless ($value_node);
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
if ($simple eq 'uid')
{
if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
{ $value = $value_node->getAttribute('href'); }
elsif ($value_node->getTagName eq 'img')
{ $value = $value_node->getAttribute('src'); }
elsif ($value_node->getTagName eq 'object')
{ $value = $value_node->getAttribute('data'); }
}
$rv->{$simple} = $value;
}
# Simple attributes - possible duplicates
my @simples = qw(nickname url label title role photo sound key logo mailer
category note rev agent);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $hcard_node);
foreach my $value_node (@value_nodes)
{
next unless ($value_node);
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
if ($simple =~ /^(url|photo|logo|sound|key)$/)
{
if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
{ $value = $value_node->getAttribute('href'); }
elsif ($value_node->getTagName eq 'img')
{ $value = $value_node->getAttribute('src'); }
elsif ($value_node->getTagName eq 'object')
{ $value = $value_node->getAttribute('data'); }
}
push @{$rv->{$simple}}, $value;
}
}
# Telephone numbers and e-mail addresses
my @comms = ('tel', 'email');
foreach my $comm (@comms)
{
my @nodes = $page->searchClass($comm, $hcard_node);
foreach my $n (@nodes)
{
my @accumValue;
my @accumType;
foreach my $v ($page->searchClass('value', $n))
{
if ($v->getTagName eq 'a' || $v->getTagName eq 'area')
{
my $href = $v->getAttribute('href');
$href =~ s/(^[a-z]+\:|\?.*$)//g;
push @accumValue, $href;
}
elsif ($v)
{
push @accumValue, Cognition::HTMLParser::Utils::STRINGIFY($v);
}
}
foreach my $v ($page->searchClass('type', $n))
{
push @accumType, uc(Cognition::HTMLParser::Utils::STRINGIFY($v));
$v->getParentNode->removeChild($v);
}
my $val = join '', @accumValue;
if ($val !~ /[0-9a-z]/i)
{
$val = Cognition::HTMLParser::Utils::STRINGIFY($n);
}
push @{ $rv->{$comm} }, {
'value' => $val,
'type' => \@accumType
};
}
}
# Check to see if this contact card is for an individual contact or an org.
# If for an org, disable n-optimisation.
my $do_n_opt = 1;
$rv->{_type} = 'contact';
foreach my $org (@{ $rv->{org} })
{
if (lc($org->{'organisation-name'}) eq lc($rv->{fn}))
{
$rv->{_type} = 'organisation';
$do_n_opt = 0;
last;
}
}
# Perform n-optimisation.
if ($do_n_opt)
{
foreach my $f (qw(family-name given-name additional-name honorific-prefix honorific-suffix))
{
if (length $rv->{$f})
{
$do_n_opt = 0;
}
}
if ($do_n_opt)
{
$rv->{fn} =~ s/(^\s|\s$)//g;
$rv->{fn} =~ s/\s+/ /g;
my @words = split / /, $rv->{fn};
if (scalar @words == 1)
{
$rv->{nickname} = $words[0];
}
elsif (scalar @words)
{
if ($words[0] =~ /^.*\,$/ || $words[1] =~ /^.\.?$/)
{
$words[0] =~ s/[\.\,]$//;
$words[1] =~ s/[\.\,]$//;
$rv->{'given-name'} = $words[1];
$rv->{'family-name'} = $words[0];
}
else
{
$rv->{'given-name'} = $words[0];
$rv->{'family-name'} = $words[-1];
}
}
}
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::hCard
######################################################################
package Cognition::HTMLParser::hCalendar;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 2; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('hcalendar', $within);
my @rv;
if (!@nodes && $within->getNodeType==DOCUMENT_NODE)
{
@nodes = ($within->getDocumentElement);
}
elsif (!@nodes)
{
@nodes = ($within);
}
foreach my $hc (@nodes)
{
my $hc_parsed = parse($page, $hc);
push @rv, $hc_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $hc_node = $rv->{'_dom'}->cloneNode(1);
my $id = $hc_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($hc_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($hc_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($hc_node, $page);
# Pull out nested hevents.
my @events = Cognition::HTMLParser::hCalendarEvent::parse_all($page, $hc_node);
$rv->{vevent} = \@events unless (!@events);
# Simple attributes - singles
my @simples = qw(version prodid);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $hc_node);
my $value_node = $value_nodes[0];
next unless ($value_node);
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
$rv->{$simple} = $value;
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::hCalendar
######################################################################
package Cognition::HTMLParser::hCalendarEvent;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 6; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('vevent', $within);
my @rv;
foreach my $ev (@nodes)
{
push @rv, parse($page, $ev);
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $event_node = $rv->{'_dom'}->cloneNode(1);
my $id = $event_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($event_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($event_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($event_node, $page);
# Pull out nested hcards to prevent them being parsed as within this event.
# Check to see if they have a 'useful' class, and if so, parse them in.
my @nested = $page->searchClass('vcard', $event_node);
foreach my $h (@nested)
{
if ($h->getAttribute('class') =~ / (^|\s) organiser (\s|$) /x)
{
push @{ $rv->{organiser} }, Cognition::HTMLParser::hCard::parse($page, $h);
}
if ($h->getAttribute('class') =~ / (^|\s) contact (\s|$) /x)
{
push @{ $rv->{contact} }, Cognition::HTMLParser::hCard::parse($page, $h);
}
if ($h->getAttribute('class') =~ / (^|\s) attendee (\s|$) /x)
{
push @{ $rv->{attendee} }, Cognition::HTMLParser::hCard::parse($page, $h);
}
if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
{
push @{ $rv->{location} }, Cognition::HTMLParser::hCard::parse($page, $h);
}
$h->getParentNode->removeChild($h);
}
# Ditto for addresses
my @nested = $page->searchClass('adr', $event_node);
foreach my $h (@nested)
{
if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
{
push @{ $rv->{location} }, Cognition::HTMLParser::adr::parse($page, $h);
}
$h->getParentNode->removeChild($h);
}
# Ditto for geo
my @nested = $page->searchClass('geo', $event_node);
foreach my $h (@nested)
{
if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x)
{
push @{ $rv->{location} }, Cognition::HTMLParser::geo::parse($page, $h);
}
else
{
push @{ $rv->{geo} }, Cognition::HTMLParser::geo::parse($page, $h);
}
$h->getParentNode->removeChild($h);
}
# Categories
my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $event_node);
$rv->{category} = \@tags unless (!@tags);
# Simple attributes - singles
my @simples = qw(class description dtend dtstart duration location status
summary uid url last-modified rdate rrule);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $event_node);
my $value_node = $value_nodes[0];
next unless ($value_node);
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
if ($simple =~ /^(url|uid)$/)
{
if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area')
{ $value = $value_node->getAttribute('href'); }
elsif ($value_node->getTagName eq 'img')
{ $value = $value_node->getAttribute('src'); }
elsif ($value_node->getTagName eq 'object')
{ $value = $value_node->getAttribute('data'); }
}
$rv->{$simple} = $value;
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::hCalendarEvent
######################################################################
package Cognition::HTMLParser::adr;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 2; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('adr', $within);
my @rv;
foreach my $adr (@nodes)
{
my $adr_parsed = parse($page, $adr);
push @rv, $adr_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $adr_node = $rv->{'_dom'}->cloneNode(1);
my $id = $adr_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($adr_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($adr_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($adr_node, $page);
# Simple attributes - possible duplicates
my @simples = qw(post-office-box extended-address street-address locality
region postal-code country-name type);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $adr_node);
foreach my $value_node (@value_nodes)
{
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
push @{$rv->{$simple}}, $value;
}
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::adr
######################################################################
package Cognition::HTMLParser::geo;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 2; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('geo', $within);
my @rv;
foreach my $geo (@nodes)
{
my $geo_parsed = parse($page, $geo);
push @rv, $geo_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $geo_node = $rv->{'_dom'}->cloneNode(1);
my $id = $geo_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($geo_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($geo_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($geo_node, $page);
# Simple attributes - singles
my @simples = qw(longitude latitude body reference-frame altitude);
foreach my $simple (@simples)
{
my @value_nodes = $page->searchClass($simple, $geo_node);
my $value_node = $value_nodes[0];
next unless ($value_node);
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
$rv->{$simple} = $value;
}
if (!length($rv->{longitude}) || !length($rv->{latitude}))
{
my $str = $geo_node->toString;
if ($geo_node->getTagName eq 'img' || $geo_node->getTagName eq 'area')
{ $str = $geo_node->getAttribute('alt'); }
$str =~ s/\,/\./g;
if ($str =~ / \s* (\-?[0-9\.]+) \s* \; \s* (\-?[0-9\.]+) \s* /x)
{
$rv->{latitude} = $1;
$rv->{longitude} = $2;
}
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::geo
######################################################################
package Cognition::HTMLParser::figure;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub MAX_INCLUDE_ITERATIONS { 2; }
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchClass('figure', $within);
my @rv;
foreach my $figure (@nodes)
{
my $figure_parsed = parse($page, $figure);
push @rv, $figure_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $figure_node = $rv->{'_dom'}->cloneNode(1);
$rv->{_dom} = undef;
my $id = $figure_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
# Expand microformat patterns.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements)
{
$replacements = Cognition::HTMLParser::Microformats::ufp_include($figure_node, $page)
+ Cognition::HTMLParser::Microformats::ufp_include2($figure_node, $page);
$incl_iterations++;
}
Cognition::HTMLParser::Microformats::ufp_abbr($figure_node, $page);
# Extract embedded hCards, which may be used as credits
my @nested = $page->searchClass('hcard', $figure_node);
foreach my $h (@nested)
{
if ($h->getAttribute('class') =~ / (^|\s) credit (\s|$) /x)
{ push @{ $rv->{credit} }, Cognition::HTMLParser::hcard::parse($page, $h); }
$h->getParentNode->removeChild($h);
}
# Additional credits
my @value_nodes = $page->searchClass('credit', $figure_node);
foreach my $value_node (@value_nodes)
{
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
push @{ $rv->{'credit'} }, $value;
}
# Legend
my @value_nodes = $page->searchClass('legend', $figure_node);
my $value_node = $value_nodes[0];
if ($value_node)
{
my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node);
$value = $value_node->getAttribute('title') if ($value_node->getTagName eq 'img');
$rv->{'legend'} = $value;
}
# Categories
my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $figure_node);
$rv->{category} = \@tags unless (!@tags);
# Find image
my $img;
if ($figure_node->getTagName eq 'img')
{
$img = $figure_node;
}
else
{
my @images = $figure_node->getElementsByTagName('img');
$img = $images[0];
}
$rv->{image} = $page->abs_url($img->getAttribute('src'));
$rv->{title} = $img->getAttribute('title') unless (length $rv->{'legend'});
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::figure
######################################################################
package Cognition::HTMLParser::Rel::Tag;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @nodes = $page->searchRel('tag', $within);
my @rv;
foreach my $t (@nodes)
{
my $t_parsed = parse($page, $t);
push @rv, $t_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $t_node = $rv->{'_dom'}->cloneNode(1);
my $id = $t_node->getAttribute('id');
$rv->{_id} = $id if (length $id);
my $ts;
my $tag = $t_node->getAttribute('href');
$tag =~ s/\#.*$//;
$tag =~ s/\?.*$//;
$tag =~ s/\/$//;
if ($tag =~ m{^(.*/)([^/]+)$})
{
$rv->{tagspace} = $1;
$rv->{tag} = $2;
}
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::Rel::Tag
######################################################################
package Cognition::HTMLParser::Rel::License;
######################################################################
use XML::DOM;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @rv;
# Unqualified 'license' rel.
my @nodes = $page->searchRel('license', $within);
# Various namespace-qualified alternatives...
push @nodes, $page->searchRel($page->_fq2pfx('http://web.resource.org/cc/license'), $within);
push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/license'), $within);
push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/rights.license'), $within);
push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/elements/1.1/rights.license'), $within);
foreach my $l (@nodes)
{
my $l_parsed = parse($page, $l);
push @rv, $l_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $t_node = $rv->{'_dom'}->cloneNode(1);
$rv->{'license'} = $t_node->getAttribute('href');
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::Rel::License
######################################################################
package Cognition::HTMLParser::xoxo;
######################################################################
use XML::DOM;
use XML::XOXO;
use strict;
BEGIN
{
our $VERSION = 0.1;
}
sub parse_all
{
my $page = shift;
my $within = shift || $page->{DOM};
my @rv;
my @nodes = $page->searchClass('xoxo', $within);
foreach my $l (@nodes)
{
my $l_parsed = parse($page, $l);
push @rv, $l_parsed;
}
return @rv;
} #/sub parse_all
sub parse
{
my $page = shift;
my $rv = { '_dom'=>shift };
my $list_node = $rv->{'_dom'}->cloneNode(1);
my $parser = XML::XOXO::Parser->new();
$rv->{'object'} = $parser->parse($list_node->toString);
bless $rv;
return $rv;
} #/sub parse
1; #/package Cognition::HTMLParser::xoxo
######################################################################
package main;
######################################################################
use strict;
use CGI qw(:standard);
use CGI::Carp 'fatalsToBrowser';
use Data::Dumper;
use LWP::UserAgent;
use LWP::RobotUA;
use HTTP::Request;
my $q = new CGI;
if ($q->request_method =~ /^(GET|POST|HEAD)$/)
{
print &header;
print "\n";
print "cognition/0.1-alpha2\n";
print "\n";
print "
cognition/0.1-alpha2
\n";
print "
\n";
my $url = $q->param('url');
if (length $url)
{
print "
\n";
my $ua = LWP::RobotUA->new(
agent => 'cognition/0.1-alpha2 ',
from => 'invalid@invalid.invalid',
delay => 0
);
my $request = HTTP::Request->new(GET => $url);
my $response = $ua->request($request);
my $x = Cognition::HTMLParser->new(
$response->content,
{
ua => $ua,
response => $response,
request => $request
}
);
print escapeHTML($x->dumpTextClean);
print "
\n";
}
else
{
print "
No URL entered!
\n";
}
}
else
{
my $ua = LWP::UserAgent->new(
agent => 'cognition/0.1-alpha2 '
);
my $url = shift @ARGV;
my $request = HTTP::Request->new(GET => $url);
my $response = $ua->request($request);
my $x = Cognition::HTMLParser->new(
$response->content,
{
ua => $ua,
response => $response,
request => $request
}
);
print $x->dumpTextClean;
}