#!/usr/bin/perl use CSS; use CSS::Parse::PRDGrammar; use Digest::SHA1 qw(sha1_hex); use XML::LibXML; use Data::Dumper; use strict; my $css = <
Toby's website.
Joe Bloggs
EOF ; my $parser = XML::LibXML->new(); my $DOM = $parser->parse_string($html); apply_sheet_to_dom($DOM, $css); print $DOM->documentElement->toStringC14N(1) . "\n"; sub apply_sheet_to_dom { my $DOM = shift; my $sheet = shift; my $CSS = parse_css($sheet); my $kwijibo = 'kwijibo'; foreach my $block (@{$CSS->{data}}) { my @nodes = $DOM->findnodes($block->{xpath}); foreach my $node (@nodes) { foreach my $rule (@{$block->{properties}}) { apply_rule_to_node($node, $rule->{property}, $rule->{value}, $CSS->{prefixes}, $kwijibo, $DOM); } } } foreach my $node ($DOM->getElementsByTagName('*')) { foreach my $prop (qw(about content)) { if (defined $node->getAttribute('x-rdf-'.$prop)) { $node->setAttribute($prop, $node->getAttribute('x-rdf-'.$prop)) if (!defined $node->getAttribute($prop)); $node->removeAttribute('x-rdf-'.$prop); } } foreach my $prop (qw(typeof rel rev property datatype role)) { if ($node->getAttribute('x-rdf-'.$prop)) { if ($node->getAttribute($prop)) { $node->setAttribute($prop, $node->getAttribute($prop).' '. $node->getAttribute('x-rdf-'.$prop)); } else { $node->setAttribute($prop, $node->getAttribute('x-rdf-'.$prop)); } $node->removeAttribute('x-rdf-'.$prop); } } } } sub apply_rule_to_node { my $node = shift; my $prop = lc(shift); my $vals = shift; my $pfxs = shift; my $kwij = shift; my $doc = shift; my @vals = split_values($vals, $pfxs); if ($prop =~ /^\-rdf\-(.*)$/) { $prop = $1; } else { return; } if ($prop =~ /^(typeof|rel|rev|property|datatype|role)$/) { if (grep {/^reset$/} @vals) { $node->setAttribute('x-rdf-'.$prop, undef); } my $new = $node->getAttribute('x-rdf-'.$prop); $new .= ' ' if ($new); foreach my $v (@vals) { $new .= "$v " unless ($v eq 'reset'); } $new =~ s/ $//; $node->setAttribute('x-rdf-'.$prop, $new); } elsif ($prop eq 'about') { foreach my $v (@vals) { if (lc($v) eq 'reset') { $node->removeAttribute('x-rdf-about'); } elsif (lc($v) eq 'document') { $node->setAttribute('x-rdf-about', '') unless (defined $node->getAttribute($prop)); } elsif ($v =~ /^NEAR:\s+(.+)$/) { my @matched = $doc->documentElement->findnodes(_css_to_xpath($1)); my $best_match; foreach my $matching_node (@matched) { if (substr($node->nodePath, 0, length($matching_node->nodePath)) eq $matching_node->nodePath) { $best_match = $matching_node if ((!$best_match) || (length($matching_node->nodePath) > length($best_match->nodePath))); } } if ($best_match) { $node->setAttribute('x-rdf-about', sprintf('[_:%s_%s]', $kwij, sha1_hex($best_match->nodePath))); } } } } elsif ($prop eq 'content') { if ($vals =~ /attr\([\'\"]?(.+)[\'\"]?\)/i) { $node->setAttribute('x-rdf-content', $node->getAttribute($1)); } } } sub split_values { my $vals = shift; my $pfxs = shift; my @rv; return @rv if ($vals =~ /^ \s* normal \s* $/i); while (length $vals) { if ($vals =~ /^ \s* (reset|document) \s* (.*) $/x) { push @rv, $1; $vals = $2; } elsif ($vals =~ /^ \s* url\(\s*\'([^\']*)\'\s*\) \s* (.*) $/ix) { push @rv, $1; $vals = $2; } elsif ($vals =~ /^ \s* url\(\s*\"([^\"]*)\"\s*\) \s* (.*) $/ix) { push @rv, $1; $vals = $2; } elsif ($vals =~ /^ \s* url\(\s*([^\"\'\)]*)\s*\) \s* (.*) $/ix) { push @rv, $1; $vals = $2; } elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*\'([^\']*)\'\s*\) \s* (.*) $/ix) { push @rv, "NEAR: $1"; $vals = $2; } elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*\"([^\"]*)\"\s*\) \s* (.*) $/ix) { push @rv, "NEAR: $1"; $vals = $2; } elsif ($vals =~ /^ \s* nearest\-ancestor\(\s*([^\"\'\)]*)\s*\) \s* (.*) $/ix) { push @rv, "NEAR: $1"; $vals = $2; } elsif ($vals =~ /^ \s* \'([^\'\:]*)\:([^\']*)\' \s* (.*) $/ix) { push @rv, $pfxs->{$1}.$2; $vals = $3; } elsif ($vals =~ /^ \s* \"([^\"\:]*)\:([^\"]*)\" \s* (.*) $/ix) { push @rv, $pfxs->{$1}.$2; $vals = $3; } elsif ($vals =~ /^ \s* ([^\"\'\:\s]*)\:([^\"\'\s]*) \s* (.*) $/ix) { push @rv, $pfxs->{$1}.$2; $vals = $3; } else { my @null; return @null; } } return @rv; } sub parse_css { my @data; my ($prefixes, $i) = ({ 'dc' => 'http://purl.org/dc/terms/', 'foaf' => 'http://xmlns.com/foaf/0.1/', 'owl' => 'http://www.w3.org/2002/07/owl#', 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#', 'sioc' => 'http://rdfs.org/sioc/ns#', 'skos' => 'http://www.w3.org/2004/02/skos/core#', 'xsd' => 'http://www.w3.org/2001/XMLSchema#' }, 0); # Fix CSS::Parse::Heavy because it doesn't support CSS properties that # start with an underscore. $CSS::Parse::PRDGrammar::GRAMMAR =~ s#macro_nmstart:\s+/\[a-zA-Z\]/ #macro_nmstart: /[a-zA-Z_-]/ #x; my $parser = CSS->new( { 'parser' => 'CSS::Parse::Heavy' } )->read_string($css); foreach my $block (@$parser) { foreach my $selector (@{ $block->{selectors} }) { if ($selector->{name} eq '_') { foreach my $property (@{ $block->{properties} }) { my $prefix = $property->{options}->{property}; my $url = $property->{options}->{value}; $url = $1 if ($url =~ /url\([\'\"]?([^\'\"]+)[\'\"]?\)/i); $prefixes->{$prefix} = $url; } next; } my $x = {}; foreach my $property (@{ $block->{properties} }) { push @{ $x->{properties} }, $property->{options}; } $x->{selector} = $selector->{name}; $x->{order} = ++$i; $x->{tokens} = _tokenize_selector($x->{selector}); $x->{specificity} = _specificity(@{ $x->{tokens} }); $x->{xpath} = _css_to_xpath(@{ $x->{tokens} }); push @data, $x; } } my @sorted = sort _cascade @data; return { prefixes => $prefixes, data => \@sorted }; } sub _specificity { return ( _count_ids(@_) * 1000000 + _count_attrs(@_) * 1000 + _count_elems(@_) + 0 ); } sub _tokenize_selector { my $selector = shift; my @rv; while (length $selector) { if ($selector =~ /^ \s* ([\>\+]) \s* (.*) $/x) { push @rv, $1; $selector = $2; } elsif ($selector =~ /^ (\s+) (.*) $/x) { push @rv, ' '; $selector = $2; } elsif ($selector =~ /^ ([^\s\>\+]+) ([\s\>\+].*) $/x) { push @rv, $1; $selector = $2; } else { push @rv, $selector; $selector = ''; } } return \@rv; } sub _count_ids { return scalar grep { /\#/ } @_; } sub _count_attrs { return scalar grep { /[\.[]/ } @_; } sub _count_elems { return scalar grep { /^[a-z]/i } @_; } sub _cascade { return ($a->{order} <=> $b->{order}) if ($a->{specificity} == $b->{specificity}); return ($a->{specificity} <=> $b->{specificity}); } sub _css_to_xpath { return '//'.__css_to_xpath(\@_); } sub _bitty { my $str = shift; my @rv; if ($str =~ /^ ([a-z0-9\*]+) (.*) $/ix) { push @rv, $1; $str = $2; } while (length $str) { if ($str =~ /^ (\[[^\]]*\]) (.*) $/ix) { push @rv, $1; $str = $2; } elsif ($str =~ /^ (\:[a-z-]+\([a-z_-]*\)) (.*) $/ix) { push @rv, $1; $str = $2; } elsif ($str =~ /^ (\.[a-z0-9_-]+) (.*) $/ix) { push @rv, $1; $str = $2; } elsif ($str =~ /^ (\#[a-z0-9_-]+) (.*) $/ix) { push @rv, $1; $str = $2; } } return @rv; } sub __css_to_xpath { my $toks = shift; my @tokens = @{$toks}; my $self = shift; my $next = 0; my $t = shift @tokens || return ''; my $rv = ''; # Make $t always start with the tag name. $t = "*$t" if ($t =~ /^[\.\#\:]/); if ($t eq '>') { $rv = '/'; } elsif ($t eq ' ') { $rv = '//'; } elsif ($t eq '+') { $rv = '/following-sibling::*[1]/'; $next = 1; } else { my @bits = _bitty($t); foreach my $bit (@bits) { if ($bit =~ /^ \. (.*) $/ix) { $rv .= "[contains(concat(\" \",\@class,\" \"),concat(\" \",\"$1\",\" \"))]"; } elsif ($bit =~ /^ \# (.*) $/ix) { $rv .= "[\@id=\"$1\"]"; } elsif ($bit =~ /^ \[ \s* (.*) \s* \~\= \s* [\"\']?(.*)[\"\']? \s* \] $/ix) { $rv .= "[contains(concat(\" \",\@$1,\" \"),concat(\" \",\"$2\",\" \"))]"; } elsif ($bit =~ /^ \[ \s* (.*) \s* \|\= \s* [\"\']?(.*)[\"\']? \s* \] $/ix) { $rv .= "[\@$1=\"$2\" or starts-with(\@$1,concat(\"$2\",\"-\"))]"; } elsif ($bit =~ /^ \[ \s* (.*) \s* \= \s* [\"\']?(.*)[\"\']? \s* \] $/ix) { $rv .= "[\@$1=\"$2\"]"; } elsif ($bit =~ /^ \[ \s* (.*) \s* \] $/ix) { $rv .= "[\@$1]"; } elsif (lc($bit) eq ':first-child') { $rv = "*[1]/self::$rv"; } elsif ($bit =~ /^ \[ \s* \:lang\((.*)\) \s* \] $/ix) { $rv .= "[\@lang=\"$1\" or starts-with(\@lang,concat(\"$1\",\"-\"))]"; } else { $rv .= $bit; } } } return 'self::'.$rv.__css_to_xpath(\@tokens, $next) if ($self); return $rv.__css_to_xpath(\@tokens, $next); }