#!/usr/bin/perl
use CSS;
use CSS::Parse::PRDGrammar;
use Digest::SHA1 qw(sha1_hex);
use XML::LibXML;
use Data::Dumper;
use strict;
my $css = <<EOF
_ {
foaf: url("http://xmlns.com/foaf/0.1/");
xsd: url("http://www.w3.org/2001/XMLSchema#");
}
.person
{
-rdf-typeof: "foaf:Person";
-rdf-about: reset nearest-ancestor(".person");
}
.person .name
{
-rdf-property: "foaf:name" url("http://purl.org/dc/terms/title");
-rdf-content: attr(title);
-rdf-datatype: "xsd:string";
-rdf-about: reset nearest-ancestor(".person");
}
.person a[href]
{
-rdf-rel: "foaf:page";
-rdf-about: reset nearest-ancestor(".person");
}
EOF
;
my $html = <<EOF
<html>
<div class="person">
<span class="name" title="Toby Inkster">Toby's</span>
<a href="http://tobyinkster.co.uk">website</a>.
<div class="person">
<span class="name">Joe Bloggs</span>
</div>
</div>
</html>
EOF
;
my $parser = XML::LibXML->new();
my $DOM = $parser->parse_string($html);
apply_sheet_to_dom($DOM, $css);
print $DOM->documentElement->toStringC14N(1) . "\n";
sub apply_sheet_to_dom
{
my $DOM = shift;
my $sheet = shift;
my $CSS = parse_css($sheet);
my $kwijibo = 'kwijibo';
foreach my $block (@{$CSS->{data}})
{
my @nodes = $DOM->findnodes($block->{xpath});
foreach my $node (@nodes)
{
foreach my $rule (@{$block->{properties}})
{
apply_rule_to_node($node, $rule->{property}, $rule->{value}, $CSS->{prefixes}, $kwijibo, $DOM);
}
}
}
foreach my $node ($DOM->getElementsByTagName('*'))
{
foreach my $prop (qw(about content))
{
if (defined $node->getAttribute('x-rdf-'.$prop))
{
$node->setAttribute($prop, $node->getAttribute('x-rdf-'.$prop))
if (!defined $node->getAttribute($prop));
$node->removeAttribute('x-rdf-'.$prop);
}
}
foreach my $prop (qw(typeof rel rev property datatype role))
{
if ($node->getAttribute('x-rdf-'.$prop))
{
if ($node->getAttribute($prop))
{
$node->setAttribute($prop,
$node->getAttribute($prop).' '.
$node->getAttribute('x-rdf-'.$prop));
}
else
{
$node->setAttribute($prop,
$node->getAttribute('x-rdf-'.$prop));
}
$node->removeAttribute('x-rdf-'.$prop);
}
}
}
}
sub apply_rule_to_node
{
my $node = shift;
my $prop = lc(shift);
my $vals = shift;
my $pfxs = shift;
my $kwij = shift;
my $doc = shift;
my @vals = split_values($vals, $pfxs);
if ($prop =~ /^\-rdf\-(.*)$/)
{
$prop = $1;
}
else
{
return;
}
if ($prop =~ /^(typeof|rel|rev|property|datatype|role)$/)
{
if (grep {/^reset$/} @vals)
{
$node->setAttribute('x-rdf-'.$prop, undef);
}
my $new = $node->getAttribute('x-rdf-'.$prop);
$new .= ' ' if ($new);
foreach my $v (@vals)
{
$new .= "$v " unless ($v eq 'reset');
}
$new =~ s/ $//;
$node->setAttribute('x-rdf-'.$prop, $new);
}
elsif ($prop eq 'about')
{
foreach my $v (@vals)
{
if (lc($v) eq 'reset')
{
$node->removeAttribute('x-rdf-about');
}
elsif (lc($v) eq 'document')
{
$node->setAttribute('x-rdf-about', '')
unless (defined $node->getAttribute($prop));
}
elsif ($v =~ /^NEAR:\s+(.+)$/)
{
my @matched = $doc->documentElement->findnodes(_css_to_xpath($1));
my $best_match;
foreach my $matching_node (@matched)
{
if (substr($node->nodePath, 0, length($matching_node->nodePath)) eq $matching_node->nodePath)
{
$best_match = $matching_node
if ((!$best_match)
|| (length($matching_node->nodePath) > length($best_match->nodePath)));
}
}
if ($best_match)
{
$node->setAttribute('x-rdf-about',
sprintf('[_:%s_%s]', $kwij, sha1_hex($best_match->nodePath)));
}
}
}
}
elsif ($prop eq 'content')
{
if ($vals =~ /attr\([\'\"]?(.+)[\'\"]?\)/i)
{
$node->setAttribute('x-rdf-content',
$node->getAttribute($1));
}
}
}
sub split_values
{
my $vals = shift;
my $pfxs = shift;
my @rv;
return @rv
if ($vals =~ /^ \s* normal \s* $/i);
while (length $vals)
{
if ($vals =~ /^ \s* (reset|document) \s* (.*) $/x)
{
push @rv, $1;
$vals = $2;
}
elsif ($vals =~ /^ \s* url\(\s*\'([^\']*)\'\s*\) \s* (.*) $/ix)
{
push @rv, $1;
$vals = $2;
}
elsif ($vals =~ /^ \s* url\(\s*\"([^\"]*)\"\s*\) \s* (.*) $/ix)
{
push @rv, $1;
$vals = $2;
}
elsif ($vals =~ /^ \s* url\(\s*([^\"\'\)]*)\s*\) \s* (.*) $/ix)
{
push @rv, $1;
$vals = $2;
}
elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*\'([^\']*)\'\s*\) \s* (.*) $/ix)
{
push @rv, "NEAR: $1";
$vals = $2;
}
elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*\"([^\"]*)\"\s*\) \s* (.*) $/ix)
{
push @rv, "NEAR: $1";
$vals = $2;
}
elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*([^\"\'\)]*)\s*\) \s* (.*) $/ix)
{
push @rv, "NEAR: $1";
$vals = $2;
}
elsif ($vals =~ /^ \s* \'([^\'\:]*)\:([^\']*)\' \s* (.*) $/ix)
{
push @rv, $pfxs->{$1}.$2;
$vals = $3;
}
elsif ($vals =~ /^ \s* \"([^\"\:]*)\:([^\"]*)\" \s* (.*) $/ix)
{
push @rv, $pfxs->{$1}.$2;
$vals = $3;
}
elsif ($vals =~ /^ \s* ([^\"\'\:\s]*)\:([^\"\'\s]*) \s* (.*) $/ix)
{
push @rv, $pfxs->{$1}.$2;
$vals = $3;
}
else
{
my @null;
return @null;
}
}
return @rv;
}
sub parse_css
{
my @data;
my ($prefixes, $i) = ({
'dc' => 'http://purl.org/dc/terms/',
'foaf' => 'http://xmlns.com/foaf/0.1/',
'owl' => 'http://www.w3.org/2002/07/owl#',
'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
'sioc' => 'http://rdfs.org/sioc/ns#',
'skos' => 'http://www.w3.org/2004/02/skos/core#',
'xsd' => 'http://www.w3.org/2001/XMLSchema#'
}, 0);
# Fix CSS::Parse::Heavy because it doesn't support CSS properties that
# start with an underscore.
$CSS::Parse::PRDGrammar::GRAMMAR =~ s#macro_nmstart:\s+/\[a-zA-Z\]/
#macro_nmstart: /[a-zA-Z_-]/
#x;
my $parser = CSS->new( { 'parser' => 'CSS::Parse::Heavy' } )->read_string($css);
foreach my $block (@$parser)
{
foreach my $selector (@{ $block->{selectors} })
{
if ($selector->{name} eq '_')
{
foreach my $property (@{ $block->{properties} })
{
my $prefix = $property->{options}->{property};
my $url = $property->{options}->{value};
$url = $1 if ($url =~ /url\([\'\"]?([^\'\"]+)[\'\"]?\)/i);
$prefixes->{$prefix} = $url;
}
next;
}
my $x = {};
foreach my $property (@{ $block->{properties} })
{
push @{ $x->{properties} }, $property->{options};
}
$x->{selector} = $selector->{name};
$x->{order} = ++$i;
$x->{tokens} = _tokenize_selector($x->{selector});
$x->{specificity} = _specificity(@{ $x->{tokens} });
$x->{xpath} = _css_to_xpath(@{ $x->{tokens} });
push @data, $x;
}
}
my @sorted = sort _cascade @data;
return {
prefixes => $prefixes,
data => \@sorted
};
}
sub _specificity
{
return (
_count_ids(@_) * 1000000 +
_count_attrs(@_) * 1000 +
_count_elems(@_) +
0
);
}
sub _tokenize_selector
{
my $selector = shift;
my @rv;
while (length $selector)
{
if ($selector =~ /^ \s* ([\>\+]) \s* (.*) $/x)
{
push @rv, $1;
$selector = $2;
}
elsif ($selector =~ /^ (\s+) (.*) $/x)
{
push @rv, ' ';
$selector = $2;
}
elsif ($selector =~ /^ ([^\s\>\+]+) ([\s\>\+].*) $/x)
{
push @rv, $1;
$selector = $2;
}
else
{
push @rv, $selector;
$selector = '';
}
}
return \@rv;
}
sub _count_ids
{
return scalar grep { /\#/ } @_;
}
sub _count_attrs
{
return scalar grep { /[\.[]/ } @_;
}
sub _count_elems
{
return scalar grep { /^[a-z]/i } @_;
}
sub _cascade
{
return ($a->{order} <=> $b->{order})
if ($a->{specificity} == $b->{specificity});
return ($a->{specificity} <=> $b->{specificity});
}
sub _css_to_xpath
{
return '//'.__css_to_xpath(\@_);
}
sub _bitty
{
my $str = shift;
my @rv;
if ($str =~ /^ ([a-z0-9\*]+) (.*) $/ix)
{
push @rv, $1;
$str = $2;
}
while (length $str)
{
if ($str =~ /^ (\[[^\]]*\]) (.*) $/ix)
{
push @rv, $1;
$str = $2;
}
elsif ($str =~ /^ (\:[a-z-]+\([a-z_-]*\)) (.*) $/ix)
{
push @rv, $1;
$str = $2;
}
elsif ($str =~ /^ (\.[a-z0-9_-]+) (.*) $/ix)
{
push @rv, $1;
$str = $2;
}
elsif ($str =~ /^ (\#[a-z0-9_-]+) (.*) $/ix)
{
push @rv, $1;
$str = $2;
}
}
return @rv;
}
sub __css_to_xpath
{
my $toks = shift;
my @tokens = @{$toks};
my $self = shift;
my $next = 0;
my $t = shift @tokens || return '';
my $rv = '';
# Make $t always start with the tag name.
$t = "*$t"
if ($t =~ /^[\.\#\:]/);
if ($t eq '>')
{ $rv = '/'; }
elsif ($t eq ' ')
{ $rv = '//'; }
elsif ($t eq '+')
{ $rv = '/following-sibling::*[1]/'; $next = 1; }
else
{
my @bits = _bitty($t);
foreach my $bit (@bits)
{
if ($bit =~ /^ \. (.*) $/ix)
{ $rv .= "[contains(concat(\" \",\@class,\" \"),concat(\" \",\"$1\",\" \"))]"; }
elsif ($bit =~ /^ \# (.*) $/ix)
{ $rv .= "[\@id=\"$1\"]"; }
elsif ($bit =~ /^ \[ \s* (.*) \s* \~\= \s* [\"\']?(.*)[\"\']? \s* \] $/ix)
{ $rv .= "[contains(concat(\" \",\@$1,\" \"),concat(\" \",\"$2\",\" \"))]"; }
elsif ($bit =~ /^ \[ \s* (.*) \s* \|\= \s* [\"\']?(.*)[\"\']? \s* \] $/ix)
{ $rv .= "[\@$1=\"$2\" or starts-with(\@$1,concat(\"$2\",\"-\"))]"; }
elsif ($bit =~ /^ \[ \s* (.*) \s* \= \s* [\"\']?(.*)[\"\']? \s* \] $/ix)
{ $rv .= "[\@$1=\"$2\"]"; }
elsif ($bit =~ /^ \[ \s* (.*) \s* \] $/ix)
{ $rv .= "[\@$1]"; }
elsif (lc($bit) eq ':first-child')
{ $rv = "*[1]/self::$rv"; }
elsif ($bit =~ /^ \[ \s* \:lang\((.*)\) \s* \] $/ix)
{ $rv .= "[\@lang=\"$1\" or starts-with(\@lang,concat(\"$1\",\"-\"))]"; }
else
{ $rv .= $bit; }
}
}
return 'self::'.$rv.__css_to_xpath(\@tokens, $next)
if ($self);
return $rv.__css_to_xpath(\@tokens, $next);
}