#!/usr/bin/perl # Cognition - A metadata browser # Copyright (c) 2008 Toby Inkster. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . ###################################################################### package Cognition::Parser; ###################################################################### use Data::Dumper; use HTML::Entities; use HTML::TreeBuilder; use HTTP::Request; use Sort::ArrayOfArrays; use URI::URL; use XML::DOM; use XML::Parser; use XML::XPath; use strict; BEGIN { our $VERSION = 0.10; our $XDP = new XML::DOM::Parser('NoLWP' => 1); } sub new { my $self = shift; my $source = shift; my $opts = shift; my $this = { 'UA' => $opts->{ua}, 'REQUEST' => $opts->{request}, 'RESPONSE' => $opts->{response} }; $this->{RESPONSE}->header('content-type') =~ /(xml)/i; my $xml = (length $1) ? 1 : 0; $this->{BASE} = $this->{RESPONSE}->base(); if ($xml) { $this->{_xml} = 1; $this->{_source} = $source; } else { $this->{_xml} = 0; $this->{_source} = $self->html2xhtml($source); $this->{_raw_source} = $source; } my $xpp = XML::Parser->new( ErrorContext => 2, # ParseParamEnt => 1, NoLWP => 1 ); $xpp->setHandlers( ExternEnt => 0, ExternEntFin => 0 ); $this->{DOM} = $Cognition::Parser::XDP->parse($this->{_source}); $this->{XP} = XML::XPath->new(xml => $this->{_source}, parser=>$xpp); bless($this); $this->parse_metadata(); $this->parse_uformats(); $this->parse_doc_structure(); return $this; } #/sub new sub dumpText { my $this = shift; return Dumper($this); } #/sub dumpText sub dumpTextClean { my $this = shift; my $tmp = {}; $tmp->{DOM} = $this->{DOM}; $this->{DOM} = '...hidden...'; $tmp->{XP} = $this->{XP}; $this->{XP} = '...hidden...'; $Data::Dumper::Sortkeys = \&_cleanKeys; my $txt = $this->dumpText; $this->{DOM} = $tmp->{DOM}; $this->{XP} = $tmp->{XP}; return $txt; } #/sub dumpTextClean sub _cleanKeys { my $hash = shift; my @keys = sort grep {/^[a-zA-Z]/} keys %{ $hash }; return \@keys; } sub quoted { return '"'.(shift).'"'; } #/sub quoted sub dumpRDF { my $this = shift; my $triples = $this->triples; my %lookup; my $rv = "\n" . "{_metadata_ns} }) { my $nsurl = $this->{_metadata_ns}->{$ns}->{nsurl}; next unless (length $nsurl); $lookup{$nsurl} = lc($ns); $rv .= "\n\txmlns:".lc($ns)."=\"".encode_entities($nsurl)."\""; } $rv .= ">\n\n"; my $current_target = undef; for (my $i=0; defined $triples->[$i]; $i++) { my ($target, $key, $value) = @{ $triples->[$i] }; my $stripped_target; if ($target ne $current_target) { if (defined $current_target) { $rv .= "\t\n\n"; } $current_target = $target; $stripped_target = $target; $stripped_target =~ s/(^\<|\>$)//g; $rv .= "\t\n" } my $tag = $this->_fq2pfx($key); if ($value =~ /^\$)//g; $rv .= "\t\t<$tag rdf:resource=\"".encode_entities($value)."\" />\n"; } else { $value =~ s/(^\"|\"$)//g; $rv .= "\t\t<$tag>".encode_entities($value)."\n"; } } if (defined $current_target) { $rv .= "\t\n\n"; } $rv .= "\n"; return $rv; } #/sub dumpRDF sub dumpTriples { my @triples = shift->triples; return Dumper(\@triples); } #/sub dumpTriples sub triples { my $this = shift; my @rv; my %RV; my @metas = keys %{ $this->{Meta} }; foreach my $m (@metas) { for (my $i=0; defined $this->{Meta}->{$m}->[$i]; $i++) { my $M = $this->{Meta}->{$m}->[$i]; my ($target, $value); $target = $M->{_target} || ''; $target = "<$target>"; if ($M->{_attr} eq 'rel') { $value = '<'.$M->{href}.'>'; if ($M->{title}) { $RV{$value}{''}{quoted($M->{title})} = 1; } } elsif ($M->{_attr} eq 'rev') { $value = '<>'; if ($M->{title}) { $RV{$value}{''}{quoted($M->{title})} = 1; } } else { $value = quoted($M->{data}); } $RV{$target}{$m}{$value} = 1 if ($m =~ /^\{$j} }) { push @rv, [($i, $j, $k)]; } } } my $sort = Sort::ArrayOfArrays->new( sort_column => '0,1,2', results => \@rv ); return $sort->sort_it; } #/sub triples sub parse_rfc2731_namespaces # Discover namespaces for metadata conforming to RFC 2731. Note # that eRDF uses the same format for metadata schemas, so we # can re-use these for eRDF too. { my $this = shift; # We'll start by defining some namespaces for metadata. # The document is free to redefine them, but it seems to # work better if we define these by default. $this->{_metadata_ns} = { # Some people forget to manually specify these 'DC' => { nsurl => 'http://purl.org/dc/elements/1.1/', title => 'Dublin Core 1.1' }, 'DCTERMS' => { nsurl => 'http://purl.org/dc/terms/', title => 'Dublin Core Terms' }, # Pseudo-schema for . 'HTTP11' => { nsurl => 'urn:ietf:rfc:2616#', title => 'Hypertext Transfer Protocol 1.1' }, # Metaschema for RFC2731 itself 'SCHEMA' => { nsurl => 'urn:ietf:rfc:2731#', title => "The 'Schema' Metaschema" }, # Defined by Atom, not strictly RFC2731-compliant, but can be # treated as such. (service.feed, service.edit, service.post) 'SERVICE' => { nsurl => 'http://www.w3.org/2007/app#', title => 'Atom Publishing Protocol' }, # Defined by OpenID, not strictly RFC2731-compliant, but can be # treated as such. (openid.server, openid.delegate) 'OPENID' => { nsurl => 'http://openid.net/specs/openid-authentication-1_1.html#', title => 'OpenID 1.1' } }; # Find custom namespaces. FOAF and other stuff goes here. my $links = $this->{XP}->find('//link'); foreach my $l ($links->get_nodelist) { my $rel = $l->getAttribute('rel'); if ($rel =~ /^schema[\.\:]([A-Z_][A-Z0-9_-]*)$/i) { my $prefix = uc($1); my $url = $this->abs_url($l->getAttribute('href')); my $title = $l->getAttribute('title'); $this->{_metadata_ns}->{$prefix}->{nsurl} = $url; $this->{_metadata_ns}->{$prefix}->{title} = $title || "The '$prefix' Schema"; } } } #/sub parse_rfc2731_namespaces sub parse_metadata { my $this = shift; $this->parse_rfc2731_namespaces; # Parse links my $links = $this->{XP}->find('//*[@rel]'); foreach my $l ($links->get_nodelist) { my $rel = $l->getAttribute('rel'); $rel =~ s/(^\s+|\s+$)//g; $rel =~ s/\s+/ /g; my @rel = split / /, $rel; foreach my $r (@rel) { my $nsrel = $this->_pfx2fq($r); push @{ $this->{Meta}->{$nsrel} }, { title => $l->getAttribute('title'), media => $l->getAttribute('media'), data => $l->getAttribute('href'), href => $this->abs_url($l->getAttribute('href')), type => $l->getAttribute('type'), lang => $l->getAttribute('lang'), _meta => $r, _xmeta => $nsrel, _xpath => $l, _tag => $l->getName, _attr => 'rel' }; } } my $links = $this->{XP}->find('//*[@rev]'); foreach my $l ($links->get_nodelist) { my $rev = $l->getAttribute('rev'); $rev =~ s/(^\s+|\s+$)//g; $rev =~ s/\s+/ /g; my @rev = split / /, $rev; foreach my $r (@rev) { my $nsrev = $this->_pfx2fq($r); push @{ $this->{Meta}->{$nsrev} }, { title => $l->getAttribute('title'), media => $l->getAttribute('media'), data => $l->getAttribute('href'), href => $this->abs_url($l->getAttribute('href')), type => $l->getAttribute('type'), lang => $l->getAttribute('lang'), rev => $rev, _xmeta => $nsrev, _meta => $r, _xpath => $l, _tag => $l->getName, _attr => 'rev', _target=> $this->abs_url($l->getAttribute('href')) }; } } # Parse my $metas = $this->{XP}->find('//meta[@name]'); foreach my $m ($metas->get_nodelist) { my $name = $m->getAttribute('name'); $name =~ s/(^\s+|\s+$)//g; $name =~ s/\s+/ /g; my @name = split / /, $name; foreach my $n (@name) { my $nsname = $this->_pfx2fq($n); push @{ $this->{Meta}->{$nsname} }, { data => $m->getAttribute('content'), lang => $m->getAttribute('lang'), scheme => $this->_pfx2fq($m->getAttribute('scheme')), name => $name, _meta => $n, _xmeta => $nsname, _xpath => $m, _tag => $m->getName, _attr => 'name' }; } } # Parse . my $metas = $this->{XP}->find('//meta[@http-equiv]'); foreach my $m ($metas->get_nodelist) { my $name = $m->getAttribute('http-equiv'); my $nsname = $this->_pfx2fq('HTTP11.'.$name); push @{ $this->{Meta}->{$nsname} }, { data => $m->getAttribute('content'), scheme => $this->_pfx2fq($m->getAttribute('scheme')), 'http-equiv' => $name, _meta => $name, _xmeta => $nsname, _xpath => $m, _tag => $m->getName }; } my @headers = $this->{RESPONSE}->header_field_names; foreach my $h (@headers) { my $nsname = $this->_pfx2fq('HTTP11.'.$h); my @vals = $this->{RESPONSE}->header($h); foreach my $v (@vals) { push @{ $this->{Meta}->{$nsname} }, { data => $v, 'http-equiv' => $h, _meta => $h, _xmeta => $nsname, _tag => '_http_header' }; } } # Parse . my $titles = $this->{XP}->find('//title'); foreach my $t ($titles->get_nodelist) { push @{ $this->{Meta}->{'title'} }, { data => $t->string_value, lang => $t->getAttribute('lang'), _meta => 'title', _xmeta => 'title', _xpath => $t, _tag => $t->getName }; } # Parse HTML role="" attribute, filling _element, data, url (if _element=='a'), lang my $elements = $this->{XP}->find('//*[@role]'); foreach my $e ($elements->get_nodelist) { my $role = $e->getAttribute('role'); $role =~ s/(^\s+|\s+$)//g; $role =~ s/\s+/ /g; my @role = split / /, $role; foreach my $r (@role) { my $nsrole = $this->_pfx2fq($r); push @{ $this->{Meta}->{$nsrole} }, { title => $e->getAttribute('title'), lang => $e->getAttribute('lang'), href => $this->abs_url($e->getAttribute('href')), data => $e->string_value, role => $role, _meta => $r, _xmeta => $nsrole, _xpath => $e, _tag => $e->getName, _attr => 'role' }; } } # Parse eRDF <URL:http://research.talis.com/2005/erdf/wiki/Main/RdfInHtml>. # We do not support the specialised "rdf:type" notation though. my $elements = $this->{XP}->find('//*[@class]'); foreach my $e ($elements->get_nodelist) { my $role = $e->getAttribute('class'); $role =~ s/(^\s+|\s+$)//g; $role =~ s/\s+/ /g; my @role = split / /, $role; foreach my $r (@role) { my $nsrole = $this->_pfx2fq($r, '-'); next if ($nsrole eq lc($r)); # Only allow namespaced classes. my $parent = $e; my $target = undef; while (defined $parent && !defined $target) { if ($parent->getAttribute('id')) { $target = $parent; } else { $parent = $parent->getParentNode(); } } push @{ $this->{Meta}->{$nsrole} }, { title => $e->getAttribute('title'), lang => $e->getAttribute('lang'), href => $this->abs_url($e->getAttribute('href')), data => $e->getAttribute('title') || $e->string_value, class => $role, _meta => $r, _xmeta => $nsrole, _xpath => $e, _tag => $e->getName, _attr => 'class', _target=> '#' . $parent->getAttribute('id') }; } } } #/sub parse_metadata sub _pfx2fq { my $this = shift; my $key = shift; my $mode = shift || '.:'; my ($ns, $term, $nsurl); # In "dash mode" allows the hyphen to be used as a namespace # prefix separator. This is for eRDF. if (($mode eq '-' && $key =~ /^([A-Z_][A-Z0-9_]*)[\.\:-](.+)$/i) # Normally, dash is allowed as part of the namespace prefix. || ($mode eq '.:' && $key =~ /^([A-Z_][A-Z0-9_-]*)[\.\:](.+)$/i)) { $ns = uc($1); $term = lc($2); $nsurl = $this->{_metadata_ns}->{$ns}->{nsurl}; # In eRDF, cope with undefined prefixes by assuming that # the class name was never intended as eRDF. if ($mode eq '-' && !$nsurl) { return $key; } # Otherwise, if an undefined prefix is used, complain quietly. $nsurl = 'http://undefined-namespace-prefix.invalid/' if (!$nsurl); return '<'.$nsurl.$term.'>'; } # If no valid prefix is found return lc($key); } #/sub _pfx2fq sub _fq2pfx { my $this = shift; my $key = shift; my $mode = shift || ':'; $key =~ s/(^\<|\>$)//g; $key =~ /^(.*[\#\/])([^\#\/]+)$/; my $nsurl = $1; $key = $2; my $pfx; foreach my $k (keys %{ $this->{_metadata_ns} }) { if ($this->{_metadata_ns}->{$k}->{nsurl} eq $nsurl) { $pfx = lc($k); last; } } return $pfx.$mode.$key; } #/sub _fq2pfx sub abs_url { my $this = shift; my $url = url shift, $this->{BASE}; return $url->abs->as_string; } #/sub abs_url sub parse_uformats { my $this = shift; my (@tmp1, @tmp2, @tmp3, @tmp4); (@tmp1 = Cognition::Parser::hCard::parse_all($this)) && ($this->{uF}->{hCard} = \@tmp1); (@tmp2 = Cognition::Parser::adr::parse_all($this)) && ($this->{uF}->{adr} = \@tmp2); (@tmp3 = Cognition::Parser::geo::parse_all($this)) && ($this->{uF}->{geo} = \@tmp3); (@tmp4 = Cognition::Parser::hCalendarEvent::parse_all($this)) && ($this->{uF}->{hCalendarEvent} = \@tmp4); # hAtom # rel-tag # rel-license + CC license # rel-enclosure } #/sub parse_uformats sub parse_doc_structure { # Create my own pseudo-uF: list of semantic tables (i.e. having <caption> or summary) # Create my own pseudo-uF: list of figures (i.e. <img class="fig" title="blah">) # Create my own pseudo-uF: list of semantic lists (i.e. <ul class="list" title="blah">) # Parse heading structure, including figures, tables and lists} } #/sub parse_doc_structure sub html2xhtml { my $this = shift; my $html = shift; $html =~ s/\<\!DOCTYPE[^>]*\>//i; my $t = HTML::TreeBuilder->new(); $t->implicit_tags(1); $t->p_strict(1); $t->parse_content($html); $t->{'xmlns'} = 'http://www.w3.org/1999/xhtml'; my $xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" . "<!DOCTYPE html\n" . "\tPUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" . "\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" . $t->as_XML(undef, "\t", { 'area'=>1, 'base'=>1, 'basefont'=>1, 'br'=>1, 'col'=>1, 'frame'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'link'=>1, 'meta'=>1, 'param'=>1 }); return $xhtml; } #/sub html2xhtml sub searchClass { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $classList = $node->getAttribute('class'); next unless (length $classList); if ($classList =~ / (^|\s) $target (\s|$) /x) { push @matches, $node; } } return @matches; } #/sub searchClass sub searchRel { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $classList = $node->getAttribute('rel'); next unless (length $classList); if ($classList =~ / (^|\s) $target (\s|$) /x) { push @matches, $node; } } return @matches; } #/sub searchRel sub searchID { my $this = shift; my $target = shift; my $dom = shift || $this->{DOM}; my $nodeList = $dom->getElementsByTagName('*'); my @matches = (); for (my $i = 0; $i < $nodeList->getLength; $i++) { my $node = $nodeList->item($i); my $id = $node->getAttribute('id') || next; if ($id eq $target) { return $node; } } } #/sub searchID 1; #/package Cognition::Parser ###################################################################### package Cognition::Parser::Microformats; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.10; } sub STRINGIFY { my $domNode = shift; my $withClass = shift || undef; my $inClass = shift || 0; if (defined $withClass && !$inClass && $domNode->getNodeType == ELEMENT_NODE) { my $c = $domNode->getAttribute('class'); $inClass = 1 if ($c =~ / (^|\s) $withClass (\s|$) /x); } if ($inClass || !defined $withClass) { if ($domNode->getNodeType == TEXT_NODE) { return $domNode->getData; } elsif ($domNode->getNodeType == ELEMENT_NODE && $domNode->getTagName eq 'img') { return $domNode->getAttribute('alt'); } elsif ($domNode->getNodeType == ELEMENT_NODE) { my @parts; foreach my $child ($domNode->getChildNodes) { push @parts, STRINGIFY($child, $withClass, $inClass); } return join(' ', @parts); } } } sub ufp_abbr # Implements the microformats ABBR pattern. { my $node = shift; my $page = shift; my $elem = shift || 'abbr'; my $attr = shift || 'title'; my $rv = 0; my $abbrs = $node->getElementsByTagName($elem); for (my $i=0; $i < $abbrs->getLength; $i++) { # For each ABBR element... my $abbr = $abbrs->item($i); # Skip pattern if no title attribute found. my $title = $abbr->getAttribute($attr) || next; # Implement Andy Mabbett's suggested "data:" prefix if ($title =~ / [\(\[] data\: (.*) [\)\]] /x || $title =~ / data\: (.*) $ /x ) { $title = $1; } # Remove child nodes my $kids = $abbr->getChildNodes; for (my $j=0; $j < $kids->getLength; $j++) { $abbr->removeChild($kids->item($j)); } # Insert contents of title attribute $abbr->addText($title) && $rv++; } # Return number of replacements made. return $rv; } # /sub ufp_abbr sub ufp_include # Implements the standard microformats include pattern. { my $node = shift; my $page = shift; my $class = shift || 'include'; my $rv = 0; my $links = $node->getElementsByTagName('a'); for (my $i=0; $i < $links->getLength; $i++) { # For each link... my $link = $links->item($i); # Skip pattern if no class attribute found. my $classList = $link->getAttribute('class') || next; # We've found a use of the include pattern if ($classList =~ / (^|\s) $class (\s|$) /x) { my $href = $link->getAttribute('href'); my $id = undef; if ($href =~ /^\#(.*)$/) { $id = $1; } else { next; } my $replacement = $page->searchID($id)->cloneNode(1) || next; $link->getParentNode->replaceChild($replacement, $link) && $rv++; } } # Return number of replacements made. return $rv; } # /sub ufp_include sub ufp_include2 # Implements the alternative microformats include pattern. { my $node = shift; my $page = shift; my $classpfx = shift || '#'; my $rv = 0; # For each element... foreach my $elem ($node->getElementsByTagName('*')) { # Skip pattern if no class attribute found. my $classList = $elem->getAttribute('class'); next unless ($classList =~ / $classpfx /x); my $atEnd = 0; $classList =~ s/(^\s|\s$)//g; $classList =~ s/\s+/ /g; my @classes = split / /, $classList; my @newClassList = (); foreach my $c (@classes) { if (substr($c,0,1) ne $classpfx && length($c)>1) { push @newClassList, $c; $atEnd = 1; next; } my $id = $c; $id =~ s/^\#//x; my $replacement = $page->searchID($id) || next; $replacement = $replacement->cloneNode(1); if ($atEnd) { $elem->appendChild($replacement) && $rv++; } else { $elem->insertBefore($replacement, $elem->getFirstChild) && $rv++; } } $elem->setAttribute('class', join(' ', @newClassList)); } # Return number of replacements made. return $rv; } # /sub ufp_include2 1; #/package Cognition::Parser::Microformats ###################################################################### package Cognition::Parser::hCard; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.10; } sub MAX_INCLUDE_ITERATIONS { 6; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('vcard', $within); my @rv; foreach my $card (@nodes) { my $hcard = parse($page, $card); push @rv, $hcard; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $hcard_node = $rv->{'_dom'}->cloneNode(1); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::Parser::Microformats::ufp_include($hcard_node, $page) + Cognition::Parser::Microformats::ufp_include2($hcard_node, $page); $incl_iterations++; } Cognition::Parser::Microformats::ufp_abbr($hcard_node, $page); # Pull out nested hcards to prevent them being parsed as within this hcard. # Check to see if they have an 'agent' class, and if so, parse them as an agent. my @nested = $page->searchClass('vcard', $hcard_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) agent (\s|$) /x) { push @{ $rv->{agent} }, parse($page, $h); } $h->getParentNode->removeChild($h); } # TODO: rel-tag for categories. # Pull out embedded addresses. my @adrs = Cognition::Parser::adr::parse_all($page, $hcard_node); $rv->{adr} = \@adrs unless (!@adrs); # Pull out embedded locations. my @geos = Cognition::Parser::geo::parse_all($page, $hcard_node); $rv->{geo} = \@geos unless (!@geos); # Parse organisations. my @orgs = $page->searchClass('org', $hcard_node); foreach my $org (@orgs) { my @orgnames = $page->searchClass('organization-name', $org); my @orgunits = $page->searchClass('organization-unit', $org); my $parsed_org = {}; if (!@orgnames) { $parsed_org = { 'organisation-unit' => undef, 'organisation-name' => Cognition::Parser::Microformats::STRINGIFY($org) }; } else { my $orgname = $orgnames[0]; my $orgunit = $orgunits[0]; $parsed_org = { 'organisation-unit' => Cognition::Parser::Microformats::STRINGIFY($orgname), 'organisation-name' => Cognition::Parser::Microformats::STRINGIFY($orgunit) }; } push @{ $rv->{org} }, $parsed_org; } # Simple attributes - singles my @simples = qw(fn bday tz sort-string uid class family-name given-name additional-name honorific-prefix honorific-suffix); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $hcard_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::Parser::Microformats::STRINGIFY($value_node); if ($simple eq 'uid') { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } $rv->{$simple} = $value; } # Simple attributes - possible duplicates my @simples = qw(nickname url label title role photo sound key logo mailer category note rev agent); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $hcard_node); foreach my $value_node (@value_nodes) { next unless ($value_node); my $value = Cognition::Parser::Microformats::STRINGIFY($value_node); if ($simple =~ /^(url|photo|logo|sound|key)$/) { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } push @{$rv->{$simple}}, $value; } } # Telephone numbers and e-mail addresses my @comms = ('tel', 'email'); foreach my $comm (@comms) { my @nodes = $page->searchClass($comm, $hcard_node); foreach my $n (@nodes) { my @accumValue; my @accumType; foreach my $v ($page->searchClass('value', $n)) { if ($v->getTagName eq 'a' || $v->getTagName eq 'area') { my $href = $v->getAttribute('href'); $href =~ s/(^[a-z]+\:|\?.*$)//g; push @accumValue, $href; } elsif ($v) { push @accumValue, Cognition::Parser::Microformats::STRINGIFY($v); } } foreach my $v ($page->searchClass('type', $n)) { push @accumType, uc(Cognition::Parser::Microformats::STRINGIFY($v)); $v->getParentNode->removeChild($v); } my $val = join '', @accumValue; if ($val !~ /[0-9a-z]/i) { $val = Cognition::Parser::Microformats::STRINGIFY($n); } push @{ $rv->{$comm} }, { 'value' => $val, 'type' => \@accumType }; } } # Check to see if this contact card is for an individual contact or an org. # If for an org, disable n-optimisation. my $do_n_opt = 1; $rv->{_type} = 'contact'; foreach my $org (@{ $rv->{org} }) { if (lc($org->{'organisation-name'}) eq lc($rv->{fn})) { $rv->{_type} = 'organisation'; $do_n_opt = 0; last; } } # Perform n-optimisation. if ($do_n_opt) { foreach my $f (qw(family-name given-name additional-name honorific-prefix honorific-suffix)) { if (length $rv->{$f}) { $do_n_opt = 0; } } if ($do_n_opt) { $rv->{fn} =~ s/(^\s|\s$)//g; $rv->{fn} =~ s/\s+/ /g; my @words = split / /, $rv->{fn}; if (scalar @words == 1) { $rv->{nickname} = $words[0]; } elsif (scalar @words) { if ($words[0] =~ /^.*\,$/ || $words[1] =~ /^.\.?$/) { $words[0] =~ s/[\.\,]$//; $words[1] =~ s/[\.\,]$//; $rv->{'given-name'} = $words[1]; $rv->{'family-name'} = $words[0]; } else { $rv->{'given-name'} = $words[0]; $rv->{'family-name'} = $words[-1]; } } } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::Parser::hCard ###################################################################### package Cognition::Parser::hCalendarEvent; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.10; } sub MAX_INCLUDE_ITERATIONS { 6; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('vevent', $within); my @rv; foreach my $ev (@nodes) { push @rv, parse($page, $ev); } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $event_node = $rv->{'_dom'}->cloneNode(1); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::Parser::Microformats::ufp_include($event_node, $page) + Cognition::Parser::Microformats::ufp_include2($event_node, $page); $incl_iterations++; } Cognition::Parser::Microformats::ufp_abbr($event_node, $page); # Pull out nested hcards to prevent them being parsed as within this event. # Check to see if they have a 'useful' class, and if so, parse them in. my @nested = $page->searchClass('vcard', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) organiser (\s|$) /x) { push @{ $rv->{organiser} }, Cognition::Parser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) contact (\s|$) /x) { push @{ $rv->{contact} }, Cognition::Parser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) attendee (\s|$) /x) { push @{ $rv->{attendee} }, Cognition::Parser::hCard::parse($page, $h); } if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::Parser::hCard::parse($page, $h); } $h->getParentNode->removeChild($h); } # Ditto for addresses my @nested = $page->searchClass('adr', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::Parser::adr::parse($page, $h); } $h->getParentNode->removeChild($h); } # Ditto for geo my @nested = $page->searchClass('geo', $event_node); foreach my $h (@nested) { if ($h->getAttribute('class') =~ / (^|\s) location (\s|$) /x) { push @{ $rv->{location} }, Cognition::Parser::geo::parse($page, $h); } else { push @{ $rv->{geo} }, Cognition::Parser::geo::parse($page, $h); } $h->getParentNode->removeChild($h); } # TODO: rel-tag for categories. # Simple attributes - singles my @simples = qw(class description dtend dtstart duration location status summary uid url last-modified rdate rrule); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $event_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::Parser::Microformats::STRINGIFY($value_node); if ($simple =~ /^(url|uid)$/) { if ($value_node->getTagName eq 'a' || $value_node->getTagName eq 'area') { $value = $value_node->getAttribute('href'); } elsif ($value_node->getTagName eq 'img') { $value = $value_node->getAttribute('src'); } elsif ($value_node->getTagName eq 'object') { $value = $value_node->getAttribute('data'); } } $rv->{$simple} = $value; } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::Parser::hCalendarEvent ###################################################################### package Cognition::Parser::adr; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.10; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('adr', $within); my @rv; foreach my $adr (@nodes) { my $adr_parsed = parse($page, $adr); push @rv, $adr_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $adr_node = $rv->{'_dom'}->cloneNode(1); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::Parser::Microformats::ufp_include($adr_node, $page) + Cognition::Parser::Microformats::ufp_include2($adr_node, $page); $incl_iterations++; } Cognition::Parser::Microformats::ufp_abbr($adr_node, $page); # Simple attributes - possible duplicates my @simples = qw(post-office-box extended-address street-address locality region postal-code country-name type); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $adr_node); foreach my $value_node (@value_nodes) { my $value = Cognition::Parser::Microformats::STRINGIFY($value_node); push @{$rv->{$simple}}, $value; } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::Parser::adr ###################################################################### package Cognition::Parser::geo; ###################################################################### use XML::DOM; use strict; BEGIN { our $VERSION = 0.10; } sub MAX_INCLUDE_ITERATIONS { 2; } sub parse_all { my $page = shift; my $within = shift || $page->{DOM}; my @nodes = $page->searchClass('geo', $within); my @rv; foreach my $geo (@nodes) { my $geo_parsed = parse($page, $geo); push @rv, $geo_parsed; } return @rv; } #/sub parse_all sub parse { my $page = shift; my $rv = { '_dom'=>shift }; my $geo_node = $rv->{'_dom'}->cloneNode(1); # Expand microformat patterns. my $incl_iterations = 0; my $replacements = 1; while (($incl_iterations < MAX_INCLUDE_ITERATIONS) && $replacements) { $replacements = Cognition::Parser::Microformats::ufp_include($geo_node, $page) + Cognition::Parser::Microformats::ufp_include2($geo_node, $page); $incl_iterations++; } Cognition::Parser::Microformats::ufp_abbr($geo_node, $page); # Simple attributes - singles my @simples = qw(longitude latitude body reference-frame altitude); foreach my $simple (@simples) { my @value_nodes = $page->searchClass($simple, $geo_node); my $value_node = $value_nodes[0]; next unless ($value_node); my $value = Cognition::Parser::Microformats::STRINGIFY($value_node); $rv->{$simple} = $value; } if (!length($rv->{longitude}) || !length($rv->{latitude})) { my $str = $geo_node->toString; if ($geo_node->getTagName eq 'img' || $geo_node->getTagName eq 'area') { $str = $geo_node->getAttribute('alt'); } $str =~ s/\,/\./g; if ($str =~ / \s* (\-?[0-9\.]+) \s* \; \s* (\-?[0-9\.]+) \s* /x) { $rv->{latitude} = $1; $rv->{longitude} = $2; } } bless $rv; return $rv; } #/sub parse 1; #/package Cognition::Parser::adr ###################################################################### package main; ###################################################################### use strict; use Data::Dumper; use HTTP::Request; my $url = shift @ARGV; my $request = HTTP::Request->new(GET => $url); my $ua = LWP::UserAgent->new; my $response = $ua->request($request); my $x = Cognition::Parser->new( $response->content, { ua => $ua, response => $response, request => $request } ); print $x->dumpTextClean;