#!/usr/bin/perl ###################################################################### # Cognition/0.1-alpha2 - A metadata browser # Copyright (c) 2008 Toby Inkster. ###################################################################### # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ###################################################################### ###################################################################### # Version History ###################################################################### # # cognition/0.1-alpha1 :- # # - initial release # - metadata: , , , @role, eRDF # - eRDF does not support rdf:type syntax # - RFC 2731 is supported for namespaces # - microformats: hcard, hcalendar, adr, geo # - hcalendar support assumes page is one giant calendar # - no support for rel-tag, so no support for categories in hcard or # hcalendar # - geo support includes body, altitiude and reference-frame extensions # - microformats patterns: include-pattern, abbr-pattern, extensions # - include-pattern supports my alternative syntax # - abbr-pattern supports Andy Mabbett's alternative # - RDF output of namespaced metadata # # cognition/0.1-alpha2 :- # # - drop usage of XML::XPath module, using XML::DOM instead # - might use XML::DOM::XPath in future if XPath support is needed # - support XML namespaces used as metadata namespaces. # - microformats: hcalendar (complete), rel-tag, rel-license, figure, xoxo # - rel-licence extended to support searches for 'license' in CC or # DCTERMS namespaces; or 'rights.license' in DC or DCTERMS namespaces # - experimental figure microformat based on current brainstorming # - parse document structure (headings + semantic tables + semantic # images/figures microformat? + xoxo lists) # # future work??? # # - add support for Person objects (hcard) to RDF output # - add support for Event objects to RDF output # - use <rdf:Bag> to wrap multiple tuples with the same subject and property # - microformats: hatom, hreview? # - support rel=meta to retrieve attitional document metadata, and parse RDF! # - RDFa? # ###################################################################### ###################################################################### package Cognition::HTMLParser; ###################################################################### use Data::Dumper; use HTML::Entities; use HTML::TreeBuilder; use HTTP::Request; use Sort::ArrayOfArrays; use URI::URL; use XML::DOM; use XML::Parser; #use XML::XPath; use strict; BEGIN { our $VERSION = 0.1; our $XDP = new XML::DOM::Parser('NoLWP' => 1); $Data::Dumper::Indent = 1; } sub new { my $self = shift; my $source = shift; my $opts = shift; my $this = { 'UA' => $opts->{ua}, 'REQUEST' => $opts->{request}, 'RESPONSE' => $opts->{response} }; $this->{RESPONSE}->header('content-type') =~ /(xml)/i; my $xml = (length $1) ? 1 : 0; $this->{BASE} = $this->{RESPONSE}->base(); if (0 && $xml) { $this->{_xml} = 1; $this->{_source} = $source; } else { $this->{_xml} = 0; $this->{_source} = $self->html2xhtml($source); $this->{_raw_source} = $source; } #my $xpp = XML::Parser->new( # ErrorContext => 2, # # ParseParamEnt => 1, # NoLWP => 1 #); #$xpp->setHandlers( # ExternEnt => 0, # ExternEntFin => 0 #); #$this->{XP} = XML::XPath->new(xml => $this->{_source}, parser=>$xpp); $this->{DOM} = $Cognition::HTMLParser::XDP->parse($this->{_source}); bless($this); $this->parse_metadata(); $this->parse_uformats(); $this->parse_doc_structure(); return $this; } #/sub new sub dumpText { my $this = shift; return Dumper($this); } #/sub dumpText sub dumpTextClean { my $this = shift; my $tmp = {}; $tmp->{DOM} = $this->{DOM}; $this->{DOM} = '...hidden...'; # $tmp->{XP} = $this->{XP}; $this->{XP} = '...hidden...'; $Data::Dumper::Sortkeys = \&_cleanKeys; my $txt = $this->dumpText; $this->{DOM} = $tmp->{DOM}; # $this->{XP} = $tmp->{XP}; return $txt; } #/sub dumpTextClean sub _cleanKeys { my $hash = shift; my @keys = sort grep {/^(_subject|_id|[^_])/} keys %{ $hash }; return \@keys; } sub quoted { return '"'.(shift).'"'; } #/sub quoted sub dumpRDF { my $this = shift; my $triples = $this->triples; my %lookup; my $rv = "<?xml version=\"1.0\"?>\n" . "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""; foreach my $ns (keys %{ $this->{metadata_ns} }) { my $nsurl = $this->{metadata_ns}->{$ns}->{nsurl}; next unless (length $nsurl); $lookup{$nsurl} = lc($ns); $rv .= "\n\txmlns:".lc($ns)."=\"".encode_entities($nsurl)."\""; } $rv .= ">\n\n"; my $current_subject = undef; for (my $i=0; defined $triples->[$i]; $i++) { my ($subject, $key, $value) = @{ $triples->[$i] }; my $stripped_subject; if ($subject ne $current_subject) { if (defined $current_subject) { $rv .= "\t</rdf:Description>\n\n"; } $current_subject = $subject; $stripped_subject = $subject; $stripped_subject =~ s/(^\<|\>$)//g; $rv .= "\t<rdf:Description rdf:about=\"".encode_entities($stripped_subject)."\">\n" } my $tag = $this->_fq2pfx($key); if ($value =~ /^\</) { $value =~ s/(^\<|\>$)//g; $rv .= "\t\t<$tag rdf:resource=\"".encode_entities($value)."\" />\n"; } else { $value =~ s/(^\"|\"$)//g; $rv .= "\t\t<$tag>".encode_entities($value)."</$tag>\n"; } } if (defined $current_subject) { $rv .= "\t</rdf:Description>\n\n"; } $rv .= "</rdf:RDF>\n"; return $rv; } #/sub dumpRDF sub dumpTriples { my @triples = shift->triples; return Dumper(\@triples); } #/sub dumpTriples sub triples { my $this = shift; my @rv; my %RV; my @metas = keys %{ $this->{Meta} }; foreach my $m (@metas) { for (my $i=0; defined $this->{Meta}->{$m}->[$i]; $i++) { my $M = $this->{Meta}->{$m}->[$i]; my ($subject, $value); $subject = $M->{_subject} || ''; $subject = "<$subject>"; if ($M->{_attr} eq 'rel') { $value = '<'.$M->{href}.'>'; if ($M->{title}) { $RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1; } } elsif ($M->{_attr} eq 'rev') { $value = '<>'; if ($M->{title}) { $RV{$value}{'<http://purl.org/dc/elements/1.1/title>'}{quoted($M->{title})} = 1; } } else { $value = quoted($M->{data}); } $RV{$subject}{$m}{$value} = 1 if ($m =~ /^\</); } } # Using a hash to begin with eliminates duplicates. foreach my $i (keys %RV) { foreach my $j (keys %{ $RV{$i} }) { foreach my $k (keys %{ %{$RV{$i}}->{$j} }) { push @rv, [($i, $j, $k)]; } } } my $sort = Sort::ArrayOfArrays->new( sort_column => '0,1,2', results => \@rv ); return $sort->sort_it; } #/sub triples sub parse_rfc2731_namespaces # Discover namespaces for metadata conforming to RFC 2731. Note # that eRDF uses the same format for metadata schemas, so we # can re-use these for eRDF too. { my $this = shift; # We'll start by defining some namespaces for metadata. # The document is free to redefine them, but it seems to # work better if we define these by default. $this->{metadata_ns} = { # Some people forget to manually specify these 'DC' => { nsurl => 'http://purl.org/dc/elements/1.1/', title => 'Dublin Core 1.1' }, 'DCTERMS' => { nsurl => 'http://purl.org/dc/terms/', title => 'Dublin Core Terms' }, # Pseudo-schema for <meta http-equiv>. 'HTTP11' => { nsurl => 'urn:ietf:rfc:2616#', title => 'Hypertext Transfer Protocol 1.1' }, # Metaschema for RFC2731 itself 'SCHEMA' => { nsurl => 'urn:ietf:rfc:2731#', title => "The 'Schema' Metaschema" }, # Defined by Atom, not strictly RFC2731-compliant, but can be # treated as such. (service.feed, service.edit, service.post) 'SERVICE' => { nsurl => 'http://www.w3.org/2007/app#', title => 'Atom Publishing Protocol' }, # Defined by OpenID, not strictly RFC2731-compliant, but can be # treated as such. (openid.server, openid.delegate) 'OPENID' => { nsurl => 'http://openid.net/specs/openid-authentication-1_1.html#', title => 'OpenID 1.1' } }; foreach my $a ($this->{DOM}->getDocumentElement->getAttributes->getValues) { my $k = $a->getName; my $v = $a->getValue; next unless $k =~ /^xmlns\:/i; $k =~ s/^xmlns\://i; $this->{metadata_ns}->{$k} = { nsurl => $v, title => "The '$k' Namespace", xmlns => 1 }; $this->{xmlns}->{$k} = { nsurl => $v, title => "The '$k' Namespace" }; } # Find custom RFC 2731 namespaces. FOAF and other stuff goes here. foreach my $l ($this->{DOM}->getElementsByTagName('link')) { my $rel = $l->getAttribute('rel'); if ($rel =~ /^schema[\.\:]([A-Z_][A-Z0-9_-]*)$/i) { my $prefix = uc($1); my $url = $this->abs_url($l->getAttribute('href')); my $title = $l->getAttribute('title'); $this->{metadata_ns}->{$prefix}->{nsurl} = $url; $this->{metadata_ns}->{$prefix}->{title} = $title || "The '$prefix' Schema"; } } } #/sub parse_rfc2731_namespaces sub parse_metadata { my $this = shift; $this->parse_rfc2731_namespaces; # Parse links my @links = $this->{DOM}->getElementsByTagName('link'); my @anchors = $this->{DOM}->getElementsByTagName('a'); push @links, @anchors; foreach my $l (@links) { my $rel = $l->getAttribute('rel'); if (length $rel) { $rel =~ s/(^\s+|\s+$)//g; $rel =~ s/\s+/ /g; my @rel = split / /, $rel; foreach my $r (@rel) { my $nsrel = $this->_pfx2fq($r, undef, $l); push @{ $this->{Meta}->{$nsrel} }, { title => $l->getAttribute('title'), media => $l->getAttribute('media'), data => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'), href => $this->abs_url($l->getAttribute('href')), type => $l->getAttribute('type'), lang => $l->getAttribute('lang'), _meta => $r, _xmeta => $nsrel, _node => $l, _tag => $l->getTagName, _attr => 'rel' }; } } my $rev = $l->getAttribute('rev'); if (length $rev) { $rev =~ s/(^\s+|\s+$)//g; $rev =~ s/\s+/ /g; my @rev = split / /, $rev; foreach my $r (@rev) { my $nsrev = $this->_pfx2fq($r, undef, $l); push @{ $this->{Meta}->{$nsrev} }, { title => $l->getAttribute('title'), media => $l->getAttribute('media'), data => ($l->getTagName eq 'a') ? Cognition::HTMLParser::Utils::STRINGIFY($l) : $l->getAttribute('href'), href => $this->abs_url($l->getAttribute('href')), type => $l->getAttribute('type'), lang => $l->getAttribute('lang'), rev => $rev, _xmeta => $nsrev, _meta => $r, _node => $l, _tag => $l->getTagName, _attr => 'rev', _subject=> $this->abs_url($l->getAttribute('href')) }; } } } # Parse <meta> foreach my $m ($this->{DOM}->getElementsByTagName('meta')) { my $name = $m->getAttribute('http-equiv'); if (length $name) { my $nsname = $this->_pfx2fq('HTTP11.'.$name); push @{ $this->{Meta}->{$nsname} }, { data => $m->getAttribute('content'), scheme => $this->_pfx2fq($m->getAttribute('scheme')), 'http-equiv' => $name, _meta => $name, _xmeta => $nsname, _node => $m, _tag => $m->getTagName, _attr => 'http-equiv' }; } else { $name = $m->getAttribute('name'); next unless (length $name); $name =~ s/(^\s+|\s+$)//g; $name =~ s/\s+/ /g; my @name = split / /, $name; foreach my $n (@name) { my $nsname = $this->_pfx2fq($n, undef, $m); push @{ $this->{Meta}->{$nsname} }, { data => $m->getAttribute('content'), lang => $m->getAttribute('lang'), scheme => $this->_pfx2fq($m->getAttribute('scheme')), name => $name, _meta => $n, _xmeta => $nsname, _node => $m, _tag => $m->getTagName, _attr => 'name' }; } } } # Parse HTTP headers. my @headers = $this->{RESPONSE}->header_field_names; foreach my $h (@headers) { my $nsname = $this->_pfx2fq('HTTP11.'.$h); my @vals = $this->{RESPONSE}->header($h); foreach my $v (@vals) { push @{ $this->{Meta}->{$nsname} }, { data => $v, 'http-equiv' => $h, _meta => $h, _xmeta => $nsname, _tag => '_http_header' }; } } # Parse <title>. foreach my $t ($this->{DOM}->getElementsByTagName('title')) { push @{ $this->{Meta}->{'title'} }, { data => Cognition::HTMLParser::Utils::STRINGIFY($t), lang => $t->getAttribute('lang'), _meta => 'title', _xmeta => 'title', _node => $t, _tag => $t->getTagName }; } # Parse HTML role="" attribute, filling _element, data, url (if _element=='a'), lang # Parse eRDF <URL:http://research.talis.com/2005/erdf/wiki/Main/RdfInHtml>. # We do not support the specialised "rdf:type" notation though. foreach my $e ($this->{DOM}->getElementsByTagName('*')) { my $role = $e->getAttribute('role'); if (length $role) { $role =~ s/(^\s+|\s+$)//g; $role =~ s/\s+/ /g; my @role = split / /, $role; foreach my $r (@role) { my $nsrole = $this->_pfx2fq($r, undef, $e); push @{ $this->{Meta}->{$nsrole} }, { title => $e->getAttribute('title'), lang => $e->getAttribute('lang'), href => $this->abs_url($e->getAttribute('href')), data => Cognition::HTMLParser::Utils::STRINGIFY($e), role => $role, _meta => $r, _xmeta => $nsrole, _node => $e, _tag => $e->getTagName, _attr => 'role' }; } } $role = $e->getAttribute('class'); if (length($role)) { $role =~ s/(^\s+|\s+$)//g; $role =~ s/\s+/ /g; my @role = split / /, $role; foreach my $r (@role) { my $nsrole = $this->_pfx2fq($r, '-', $e); next if ($nsrole eq lc($r)); # Only allow namespaced classes. my $parent = $e; my $subject = undef; while (!defined $subject && $parent->getNodeType==ELEMENT_NODE) { if ($parent->getAttribute('id')) { $subject = $parent; } else { $parent = $parent->getParentNode(); } } push @{ $this->{Meta}->{$nsrole} }, { title => $e->getAttribute('title'), lang => $e->getAttribute('lang'), href => $this->abs_url($e->getAttribute('href')), data => $e->getAttribute('title') || Cognition::HTMLParser::Utils::STRINGIFY($e), class => $role, _meta => $r, _xmeta => $nsrole, _node => $e, _tag => $e->getTagName, _attr => 'class', _subject=> $parent->getNodeType==ELEMENT_NODE ? ('#' . $parent->getAttribute('id')) : '' }; } } } } #/sub parse_metadata sub _pfx2fq { my $this = shift; my $key = shift; my $mode = shift; my $dom = shift; my ($ns, $term, $nsurl); $mode = '.:' unless length $mode; # In "dash mode" allows the hyphen to be used as a namespace # prefix separator. This is for eRDF. if (($mode eq '-' && $key =~ /^([A-Z_][A-Z0-9_]*)[\.\:-](.+)$/i) # Normally, dash is allowed as part of the namespace prefix. || ($mode eq '.:' && $key =~ /^([A-Z_][A-Z0-9_-]*)[\.\:](.+)$/i)) { $ns = $1; $term = $2; $nsurl = $this->{metadata_ns}->{uc($ns)}->{nsurl}; # If we haven't found the namespace yet, then check xmlns:* attributes. if ($dom && !length $nsurl) { $nsurl = Cognition::HTMLParser::Utils::xmlns($ns, $dom); $this->{xmlns}->{$ns} = { nsurl => $nsurl, xmlns => 1 }; } # In eRDF, cope with undefined prefixes by assuming that # the class name was never intended as eRDF. if ($mode eq '-' && !$nsurl) { return $key; } # Otherwise, if an undefined prefix is used, complain quietly. $nsurl = 'http://undefined-namespace-prefix.invalid/' if (!$nsurl); return '<'.$nsurl.$term.'>'; } # If no valid prefix is found, treat as case-insensitive default namespace return lc($key); } #/sub _pfx2fq sub _fq2pfx { my $this = shift; my $key = shift; my $mode = shift || ':'; $key =~ s/(^\<|\>$)//g; $key =~ /^(.*[\#\/])([^\#\/]+)$/; my $nsurl = $1; $key = $2; my $pfx; foreach my $k (keys %{ $this->{metadata_ns} }) { if ($this->{metadata_ns}->{$k}->{nsurl} eq $nsurl) { $pfx = lc($k); return $pfx.$mode.$key; } } foreach my $k (keys %{ $this->{xmlns} }) { if ($this->{xmlns}->{$k}->{nsurl} eq $nsurl) { $pfx = $k; return $pfx.$mode.$key; } } return $key; } #/sub _fq2pfx sub abs_url { my $this = shift; my $url = url shift, $this->{BASE}; return $url->abs->as_string; } #/sub abs_url sub parse_uformats { my $this = shift; my (@tmp1, @tmp2, @tmp3, @tmp4, @tmp5, @tmp6); (@tmp1 = Cognition::HTMLParser::hCard::parse_all($this)) && ($this->{uF}->{hCard} = \@tmp1); (@tmp2 = Cognition::HTMLParser::adr::parse_all($this)) && ($this->{uF}->{adr} = \@tmp2); (@tmp3 = Cognition::HTMLParser::geo::parse_all($this)) && ($this->{uF}->{geo} = \@tmp3); (@tmp4 = Cognition::HTMLParser::Rel::Tag::parse_all($this)) && ($this->{uF}->{Tag} = \@tmp4); (@tmp5 = Cognition::HTMLParser::hCalendar::parse_all($this)) && ($this->{uF}->{hCalendar} = \@tmp5); (@tmp6 = Cognition::HTMLParser::Rel::License::parse_all($this)) && ($this->{uF}->{License} = \@tmp6); # hAtom # rel-license + CC license # rel-enclosure } #/sub parse_uformats sub parse_doc_structure { my $this = shift; my $rv = { kids=>() }; my $lvl = 1; my $ptr = $rv; my $count_lists; my $count_tables; my $count_figures; # Create my own pseudo-uF: list of semantic tables (i.e. having <caption> or summary) # Create my own pseudo-uF: list of figures (i.e. <img class="figure" title="blah">) # Create my own pseudo-uF: list of semantic lists (i.e. <ul class="xoxo">) # Parse heading structure, including figures, tables and lists} foreach my $e ($this->{DOM}->getElementsByTagName('*')) { my $tag = $e->getTagName; if ($tag eq 'figure' || $e->getAttribute('class') =~ / (^|\s) figure (\s|$) /x) { my $node = {}; push @{ $ptr->{kids}->[-1]->{kids} }, $node; $node->{tag} = 'figure'; $node->{count} = ++$count_figures; $node->{figure} = Cognition::HTMLParser::figure::parse($this, $e); $node->{_dom} = $e; } elsif ($e->getAttribute('class') =~ / (^|\s) xoxo (\s|$) /x) { my $node = {}; push @{ $ptr->{kids}->[-1]->{kids} }, $node; $node->{tag} = 'xoxo'; $node->{count} = ++$count_lists; $node->{figure} = Cognition::HTMLParser::xoxo::parse($this, $e); $node->{_dom} = $e; } elsif ($tag =~ /^h([1-6])$/i) { my $thisLevel = $1; my $node = {}; while ($thisLevel > $lvl) { my $parent = $ptr->{kids}->[-1]; $parent->{up} = $ptr; $ptr = $parent; $lvl++; } while ($thisLevel < $lvl) { $lvl--; $ptr = $ptr->{up}; } if ($thisLevel == $lvl) { push @{ $ptr->{kids} }, $node; } $node->{tag} = $tag; $node->{_dom} = $e; $node->{heading} = Cognition::HTMLParser::Utils::STRINGIFY($e); } elsif ( $tag eq 'table' && (length $e->getAttribute('summary') || defined $e->getElementsByTagName('caption')->item(0)) ) { my $node = {}; push @{ $ptr->{kids}->[-1]->{kids} }, $node; $node->{tag} = $tag; $node->{_dom} = $e; $node->{summary} = $e->getAttribute('summary'); $node->{count} = ++$count_tables; my @captions = $e->getElementsByTagName('caption'); if (@captions) { $node->{caption} = Cognition::HTMLParser::Utils::STRINGIFY($captions[0]); } } } $this->{Structure} = $rv; return $rv; } #/sub parse_doc_structure sub html2xhtml { my $this = shift; my $html = shift; $html =~ s/\<\!DOCTYPE[^>]*\>//i; my $t = HTML::TreeBuilder->new(); $t->implicit_tags(1); $t->p_strict(1); $t->parse_content($html); $t->{'xmlns'} = 'http://www.w3.org/1999/xhtml'; my $xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" . "<!DOCTYPE html\n" . "\tPUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" . "\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" . $t->as_XML(undef, "\t", { 'area'=>1, 'base'=>1, 'basefont'=>1, 'br'=>1, 'col'=>1, 'frame'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'link'=>1, 'meta'=>1, 'param'=>1 }); return $xhtml; } #/sub html2xhtml sub searchClass { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $classList = $node->getAttribute('class'); next unless (length $classList); if ($classList =~ / (^|\s) $target (\s|$) /x) { push @matches, $node; } } return @matches; } #/sub searchClass sub searchRel { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; $target =~ s/[\:\.]/\[\:\.\]/; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $classList = $node->getAttribute('rel'); next unless (length $classList); if ($classList =~ / (^|\s) $target (\s|$) /ix) { push @matches, $node; } } return @matches; } #/sub searchRel sub searchID { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $id = $node->getAttribute('id') || next; if ($id eq $target) { return $node; } } } #/sub searchID 1; #/package Cognition::HTMLParser ###################################################################### package Cognition::HTMLParser::Utils; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub STRINGIFY { my $domNode = shift; my $withClass = shift || undef; my $inClass = shift || 0; if (defined $withClass && !$inClass && $domNode->getNodeType == ELEMENT_NODE) { my $c = $domNode->getAttribute('class'); $inClass = 1 if ($c =~ / (^|\s) $withClass (\s|$) /x); } if ($inClass || !defined $withClass) { if ($domNode->getNodeType == TEXT_NODE) { return $domNode->getData; } elsif ($domNode->getNodeType == ELEMENT_NODE && $domNode->getTagName eq 'img') { return $domNode->getAttribute('alt'); } elsif ($domNode->getNodeType == ELEMENT_NODE) { my @parts; foreach my $child ($domNode->getChildNodes) { push @parts, STRINGIFY($child, $withClass, $inClass); } return join(' ', @parts); } } } sub xmlns { my $ns = shift; my $dom = shift; while ($dom->getNodeType==ELEMENT_NODE) { if (length $dom->getAttribute("xmlns:$ns")) { return $dom->getAttribute("xmlns:$ns"); } $dom = $dom->getParentNode; } if ($dom->getNodeType==DOCUMENT_NODE) { if (length $dom->getDocumentElement->getAttribute("xmlns:$ns")) { return $dom->getDocumentElement->getAttribute("xmlns:$ns"); } } return undef; } 1; #/package Cognition::HTMLParser::Utils ###################################################################### package Cognition::HTMLParser::Microformats; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub ufp_abbr # Implements the microformats ABBR pattern. { my $node = shift; my $page = shift; my $elem = shift || 'abbr'; my $attr = shift || 'title'; my $rv = 0; my $abbrs = $node->getElementsByTagName('*'); for (my $i=0; $i < $abbrs->getLength; $i++) { # For each ABBR element... my $abbr = $abbrs->item($i); # Skip pattern if no title attribute found. my $title = $abbr->getAttribute($attr) || next; # ABBR element (or other element specified as third parameter) should # *always* be expanded. Other elements should only be expanded if the # title attribute (or other attribute, fourth parameter) includes the # string 'data:'. next unless ($abbr->getTagName eq $elem || $abbr->getAttribute($attr) =~ /data\:/); # Implement Andy Mabbett's suggested "data:" prefix if ($title =~ / [\(\[] data\: (.*) [\)\]] /x || $title =~ / data\: (.*) $ /x ) { $title = $1; } # Remove child nodes my $kids = $abbr->getChildNodes; for (my $j=0; $j < $kids->getLength; $j++) { $abbr->removeChild($kids->item($j)); } # Insert contents of title attribute $abbr->addText($title) && $rv++; } # Return number of replacements made. return $rv; } # /sub ufp_abbr sub ufp_include # Implements the standard microformats include pattern. { my $node = shift; my $page = shift; my $class = shift || 'include'; my $rv = 0; my $links = $node->getElementsByTagName('a'); for (my $i=0; $i < $links->getLength; $i++) { # For each link... my $link = $links->item($i); # Skip pattern if no class attribute found. my $classList = $link->getAttribute('class') || next; # We've found a use of the include pattern if ($classList =~ / (^|\s) $class (\s|$) /x) { my $href = $link->getAttribute('href'); my $id = undef; if ($href =~ /^\#(.*)$/) { $id = $1; } else { next; } my $replacement = $page->searchID($id)->cloneNode(1) || next; $link->getParentNode->replaceChild($replacement, $link) && $rv++; } } # Return number of replacements made. return $rv; } # /sub ufp_include sub ufp_include2 # Implements the alternative microformats include pattern. { my $node = shift; my $page = shift; my $classpfx = shift || '#'; my $rv = 0; # For each element... foreach my $elem ($node->getElementsByTagName('*')) { # Skip pattern if no class attribute found. my $classList = $elem->getAttribute('class'); next unless ($classList =~ / $classpfx /x); my $atEnd = 0; $classList =~ s/(^\s|\s$)//g; $classList =~ s/\s+/ /g; my @classes = split / /, $classList; my @newClassList = (); foreach my $c (@classes) { if (substr($c,0,1) ne $classpfx && length($c)>1) { push @newClassList, $c; $atEnd = 1; next; } my $id = $c; $id =~ s/^\#//x; my $replacement = $page->searchID($id) || next; $replacement = $replacement->cloneNode(1); if ($atEnd) { $elem->appendChild($replacement) && $rv++; } else { $elem->insertBefore($replacement, $elem->getFirstChild) && $rv++; } } $elem->setAttribute('class', join(' ', @newClassList)); } # Return number of replacements made. return $rv; } # /sub ufp_include2 1; #/package Cognition::HTMLParser::Microformats ###################################################################### package Cognition::HTMLParser::hCard; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 6; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('vcard', $within); my @rv; foreach my $card (@nodes) { my $hcard = parse($page, $card); push @rv, $hcard; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $hcard_node = $rv->{'_dom'}->cloneNode(1); my $id = $hcard_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($hcard_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($hcard_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($hcard_node, $page); # Pull out nested hcards to prevent them being parsed as within this hcard. # Check to see if they have an 'agent' class, and if so, parse them as an agent. my @nested = $page->searchClass('vcard', $hcard_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) agent (\s|$) /x) { push @{ $rv->{agent} }, parse($page, $h); } $h->getParentNode->removeChild($h); } # Categories my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $hcard_node); $rv->{category} = \@tags unless (!@tags); # Pull out embedded addresses. my @adrs = Cognition::HTMLParser::adr::parse_all($page, $hcard_node); $rv->{adr} = \@adrs unless (!@adrs); # Pull out embedded locations. my @geos = Cognition::HTMLParser::geo::parse_all($page, $hcard_node); $rv->{geo} = \@geos unless (!@geos); # Parse organisations. my @orgs = $page->searchClass('org', $hcard_node); foreach my $org (@orgs) { my @orgnames = $page->searchClass('organization-name', $org); my @orgunits = $page->searchClass('organization-unit', $org); my $parsed_org = {}; if (!@orgnames) { $parsed_org = { 'organisation-unit' => undef, 'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($org) }; } else { my $orgname = $orgnames[0]; my $orgunit = $orgunits[0]; $parsed_org = { 'organisation-unit' => Cognition::HTMLParser::Utils::STRINGIFY($orgname), 'organisation-name' => Cognition::HTMLParser::Utils::STRINGIFY($orgunit) }; } push @{ $rv->{org} }, $parsed_org; } # Simple attributes - singles my @simples = qw(fn bday tz sort-string uid class family-name given-name additional-name honorific-prefix honorific-suffix); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $hcard_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); if ($simple eq 'uid') { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } $rv->{$simple} = $value; } # Simple attributes - possible duplicates my @simples = qw(nickname url label title role photo sound key logo mailer category note rev agent); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $hcard_node); foreach my $value_node (@value_nodes) { next unless ($value_node); my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); if ($simple =~ /^(url|photo|logo|sound|key)$/) { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } push @{$rv->{$simple}}, $value; } } # Telephone numbers and e-mail addresses my @comms = ('tel', 'email'); foreach my $comm (@comms) { my @nodes = $page->searchClass($comm, $hcard_node); foreach my $n (@nodes) { my @accumValue; my @accumType; foreach my $v ($page->searchClass('value', $n)) { if ($v->getTagName eq 'a' || $v->getTagName eq 'area') { my $href = $v->getAttribute('href'); $href =~ s/(^[a-z]+\:|\?.*$)//g; push @accumValue, $href; } elsif ($v) { push @accumValue, Cognition::HTMLParser::Utils::STRINGIFY($v); } } foreach my $v ($page->searchClass('type', $n)) { push @accumType, uc(Cognition::HTMLParser::Utils::STRINGIFY($v)); $v->getParentNode->removeChild($v); } my $val = join '', @accumValue; if ($val !~ /[0-9a-z]/i) { $val = Cognition::HTMLParser::Utils::STRINGIFY($n); } push @{ $rv->{$comm} }, { 'value' => $val, 'type' => \@accumType }; } } # Check to see if this contact card is for an individual contact or an org. # If for an org, disable n-optimisation. my $do_n_opt = 1; $rv->{_type} = 'contact'; foreach my $org (@{ $rv->{org} }) { if (lc($org->{'organisation-name'}) eq lc($rv->{fn})) { $rv->{_type} = 'organisation'; $do_n_opt = 0; last; } } # Perform n-optimisation. if ($do_n_opt) { foreach my $f (qw(family-name given-name additional-name honorific-prefix honorific-suffix)) { if (length $rv->{$f}) { $do_n_opt = 0; } } if ($do_n_opt) { $rv->{fn} =~ s/(^\s|\s$)//g; $rv->{fn} =~ s/\s+/ /g; my @words = split / /, $rv->{fn}; if (scalar @words == 1) { $rv->{nickname} = $words[0]; } elsif (scalar @words) { if ($words[0] =~ /^.*\,$/ || $words[1] =~ /^.\.?$/) { $words[0] =~ s/[\.\,]$//; $words[1] =~ s/[\.\,]$//; $rv->{'given-name'} = $words[1]; $rv->{'family-name'} = $words[0]; } else { $rv->{'given-name'} = $words[0]; $rv->{'family-name'} = $words[-1]; } } } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::hCard ###################################################################### package Cognition::HTMLParser::hCalendar; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('hcalendar', $within); my @rv; if (!@nodes && $within->getNodeType==DOCUMENT_NODE) { @nodes = ($within->getDocumentElement); } elsif (!@nodes) { @nodes = ($within); } foreach my $hc (@nodes) { my $hc_parsed = parse($page, $hc); push @rv, $hc_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $hc_node = $rv->{'_dom'}->cloneNode(1); my $id = $hc_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($hc_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($hc_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($hc_node, $page); # Pull out nested hevents. my @events = Cognition::HTMLParser::hCalendarEvent::parse_all($page, $hc_node); $rv->{vevent} = \@events unless (!@events); # Simple attributes - singles my @simples = qw(version prodid); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $hc_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); $rv->{$simple} = $value; } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::hCalendar ###################################################################### package Cognition::HTMLParser::hCalendarEvent; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 6; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('vevent', $within); my @rv; foreach my $ev (@nodes) { push @rv, parse($page, $ev); } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $event_node = $rv->{'_dom'}->cloneNode(1); my $id = $event_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($event_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($event_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($event_node, $page); # Pull out nested hcards to prevent them being parsed as within this event. # Check to see if they have a 'useful' class, and if so, parse them in. my @nested = $page->searchClass('vcard', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) organiser (\s|$) /x) { push @{ $rv->{organiser} }, Cognition::HTMLParser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) contact (\s|$) /x) { push @{ $rv->{contact} }, Cognition::HTMLParser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) attendee (\s|$) /x) { push @{ $rv->{attendee} }, Cognition::HTMLParser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::HTMLParser::hCard::parse($page, $h); } $h->getParentNode->removeChild($h); } # Ditto for addresses my @nested = $page->searchClass('adr', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::HTMLParser::adr::parse($page, $h); } $h->getParentNode->removeChild($h); } # Ditto for geo my @nested = $page->searchClass('geo', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::HTMLParser::geo::parse($page, $h); } else { push @{ $rv->{geo} }, Cognition::HTMLParser::geo::parse($page, $h); } $h->getParentNode->removeChild($h); } # Categories my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $event_node); $rv->{category} = \@tags unless (!@tags); # Simple attributes - singles my @simples = qw(class description dtend dtstart duration location status summary uid url last-modified rdate rrule); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $event_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); if ($simple =~ /^(url|uid)$/) { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } $rv->{$simple} = $value; } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::hCalendarEvent ###################################################################### package Cognition::HTMLParser::adr; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('adr', $within); my @rv; foreach my $adr (@nodes) { my $adr_parsed = parse($page, $adr); push @rv, $adr_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $adr_node = $rv->{'_dom'}->cloneNode(1); my $id = $adr_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($adr_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($adr_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($adr_node, $page); # Simple attributes - possible duplicates my @simples = qw(post-office-box extended-address street-address locality region postal-code country-name type); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $adr_node); foreach my $value_node (@value_nodes) { my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); push @{$rv->{$simple}}, $value; } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::adr ###################################################################### package Cognition::HTMLParser::geo; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('geo', $within); my @rv; foreach my $geo (@nodes) { my $geo_parsed = parse($page, $geo); push @rv, $geo_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $geo_node = $rv->{'_dom'}->cloneNode(1); my $id = $geo_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($geo_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($geo_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($geo_node, $page); # Simple attributes - singles my @simples = qw(longitude latitude body reference-frame altitude); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $geo_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); $rv->{$simple} = $value; } if (!length($rv->{longitude}) || !length($rv->{latitude})) { my $str = $geo_node->toString; if ($geo_node->getTagName eq 'img' || $geo_node->getTagName eq 'area') { $str = $geo_node->getAttribute('alt'); } $str =~ s/\,/\./g; if ($str =~ / \s* (\-?[0-9\.]+) \s* \; \s* (\-?[0-9\.]+) \s* /x) { $rv->{latitude} = $1; $rv->{longitude} = $2; } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::geo ###################################################################### package Cognition::HTMLParser::figure; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('figure', $within); my @rv; foreach my $figure (@nodes) { my $figure_parsed = parse($page, $figure); push @rv, $figure_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $figure_node = $rv->{'_dom'}->cloneNode(1); $rv->{_dom} = undef; my $id = $figure_node->getAttribute('id'); $rv->{_id} = $id if (length $id); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::HTMLParser::Microformats::ufp_include($figure_node, $page) + Cognition::HTMLParser::Microformats::ufp_include2($figure_node, $page); $incl_iterations++; } Cognition::HTMLParser::Microformats::ufp_abbr($figure_node, $page); # Extract embedded hCards, which may be used as credits my @nested = $page->searchClass('hcard', $figure_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) credit (\s|$) /x) { push @{ $rv->{credit} }, Cognition::HTMLParser::hcard::parse($page, $h); } $h->getParentNode->removeChild($h); } # Additional credits my @value_nodes = $page->searchClass('credit', $figure_node); foreach my $value_node (@value_nodes) { my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); push @{ $rv->{'credit'} }, $value; } # Legend my @value_nodes = $page->searchClass('legend', $figure_node); my $value_node = $value_nodes[0]; if ($value_node) { my $value = Cognition::HTMLParser::Utils::STRINGIFY($value_node); $value = $value_node->getAttribute('title') if ($value_node->getTagName eq 'img'); $rv->{'legend'} = $value; } # Categories my @tags = Cognition::HTMLParser::Rel::Tag::parse_all($page, $figure_node); $rv->{category} = \@tags unless (!@tags); # Find image my $img; if ($figure_node->getTagName eq 'img') { $img = $figure_node; } else { my @images = $figure_node->getElementsByTagName('img'); $img = $images[0]; } $rv->{image} = $page->abs_url($img->getAttribute('src')); $rv->{title} = $img->getAttribute('title') unless (length $rv->{'legend'}); bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::figure ###################################################################### package Cognition::HTMLParser::Rel::Tag; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchRel('tag', $within); my @rv; foreach my $t (@nodes) { my $t_parsed = parse($page, $t); push @rv, $t_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $t_node = $rv->{'_dom'}->cloneNode(1); my $id = $t_node->getAttribute('id'); $rv->{_id} = $id if (length $id); my $ts; my $tag = $t_node->getAttribute('href'); $tag =~ s/\#.*$//; $tag =~ s/\?.*$//; $tag =~ s/\/$//; if ($tag =~ m{^(.*/)([^/]+)$}) { $rv->{tagspace} = $1; $rv->{tag} = $2; } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::Rel::Tag ###################################################################### package Cognition::HTMLParser::Rel::License; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.1; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @rv; # Unqualified 'license' rel. my @nodes = $page->searchRel('license', $within); # Various namespace-qualified alternatives... push @nodes, $page->searchRel($page->_fq2pfx('http://web.resource.org/cc/license'), $within); push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/license'), $within); push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/terms/rights.license'), $within); push @nodes, $page->searchRel($page->_fq2pfx('http://purl.org/dc/elements/1.1/rights.license'), $within); foreach my $l (@nodes) { my $l_parsed = parse($page, $l); push @rv, $l_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $t_node = $rv->{'_dom'}->cloneNode(1); $rv->{'license'} = $t_node->getAttribute('href'); bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::Rel::License ###################################################################### package Cognition::HTMLParser::xoxo; ###################################################################### use XML::DOM; use XML::XOXO; use strict; BEGIN { our $VERSION = 0.1; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @rv; my @nodes = $page->searchClass('xoxo', $within); foreach my $l (@nodes) { my $l_parsed = parse($page, $l); push @rv, $l_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $list_node = $rv->{'_dom'}->cloneNode(1); my $parser = XML::XOXO::Parser->new(); $rv->{'object'} = $parser->parse($list_node->toString); bless $rv; return $rv; } #/sub parse 1; #/package Cognition::HTMLParser::xoxo ###################################################################### package main; ###################################################################### use strict; use CGI qw(:standard); use CGI::Carp 'fatalsToBrowser'; use Data::Dumper; use LWP::UserAgent; use LWP::RobotUA; use HTTP::Request; my $q = new CGI; if ($q->request_method =~ /^(GET|POST|HEAD)$/) { print &header; print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n"; print "<title>cognition/0.1-alpha2\n"; print "\n"; print "

cognition/0.1-alpha2

\n"; print "
\n"; print "
\n"; print " Analyse semantics of a page\n"; print " \n"; print " \n"; print " \n"; print "
\n"; print "
\n"; my $url = $q->param('url'); if (length $url) { print "
\n";

		my $ua = LWP::RobotUA->new(
			agent      => 'cognition/0.1-alpha2 ',
			from       => 'invalid@invalid.invalid',
			delay      => 0
		);

		my $request  = HTTP::Request->new(GET => $url);
		my $response = $ua->request($request);
		my $x        = Cognition::HTMLParser->new(
				$response->content, 
				{
					ua        => $ua,
					response  => $response,
					request   => $request
				}
			);
		print escapeHTML($x->dumpTextClean);
		print "
\n"; } else { print "

No URL entered!

\n"; } } else { my $ua = LWP::UserAgent->new( agent => 'cognition/0.1-alpha2 ' ); my $url = shift @ARGV; my $request = HTTP::Request->new(GET => $url); my $response = $ua->request($request); my $x = Cognition::HTMLParser->new( $response->content, { ua => $ua, response => $response, request => $request } ); print $x->dumpTextClean; }