#!/usr/bin/perl # JSON::PP is far too slow! BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS'; } use Getopt::Long; use HTML::Entities qw(encode_entities_numeric); use JSON; use strict; use utf8; my $sdbroot = $ENV{SDBROOT}; my $initfile = "$sdbroot/sdb.ttl"; my $syntax = 'nquad'; my $prefixes = {}; my $expand = { 'http://purl.org/vocab/bio/0.1/' => 'bio' , 'http://creativecommons.org/ns#' => 'cc' , 'http://web.resource.org/cc/' => 'ccold' , 'http://creativecommons.org/ns#' => 'ccrel' , 'http://purl.org/dc/terms/' => 'dc' , 'http://purl.org/dc/elements/1.1/' => 'dc11' , 'http://ramonantonio.net/doac/0.1/#' => 'doac' , 'http://usefulinc.com/ns/doap#' => 'doap' , 'http://xmlns.com/foaf/0.1/' => 'foaf' , 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo' , 'http://www.w3.org/2002/12/cal/icaltzd#' => 'ical' , 'http://laconi.ca/ont/' => 'lac' , 'http://ontologi.es/like#' => 'like' , 'http://www.w3.org/2000/10/swap/log#' => 'log' , 'http://purl.org/ontology/mo/' => 'mo' , 'http://open.vocab.org/terms/' => 'ov' , 'http://www.w3.org/2002/07/owl#' => 'owl' , 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf' , 'http://www.w3.org/2004/03/trix/rdfg-1/' => 'rdfg' , 'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs' , 'http://purl.org/vocab/relationship/' => 'rel' , 'http://purl.org/stuff/rev#' => 'rev' , 'http://purl.org/rss/1.0/' => 'rss' , 'http://rdfs.org/sioc/ns#' => 'sioc' , 'http://www.w3.org/2004/02/skos/core#' => 'skos' , 'http://www.holygoat.co.uk/owl/redwood/0.1/tags/' => 'tags' , 'http://www.w3.org/2006/vcard/ns#' => 'vcard' , 'http://xmlns.com/wot/0.1/' => 'wot' , 'http://vocab.sindice.com/xfn#' => 'xfn' , 'http://www.w3.org/1999/xhtml/vocab#' => 'xhv' , 'http://www.w3.org/2001/XMLSchema#' => 'xsd' , }; GetOptions( 'sdb=s' => \$initfile , 'syntax=s' => \$syntax , ); my $outfile = shift @ARGV || die "Please specify output file name.\n"; # Note that sdb2load.pl only imports JSON, NQuads and TriX. (N3 and TriG are not imported.) die "Unrecognised syntax. Valid values are 'json', 'nquad', 'n3', 'trig' and 'trix'.\n" unless $syntax =~ /^(json|nq|nquads?|trix|trig|n3)$/i; print STDERR "Querying triple store for data.\n"; my $jsondata; open JSONDATA, "-|:encoding(UTF-8)", "$sdbroot/bin/sdbquery --results=JSON --sdb=$initfile 'SELECT ?g ?s ?p ?o WHERE { GRAPH ?g {?s ?p ?o}}'"; $jsondata = ; $jsondata = '' if $jsondata =~ /Warning: SDB_JDBC not set/; # first line might be rubbish! $jsondata .= $_ while ; close JSONDATA; open OUTFILE, ">$outfile"; if ($syntax =~ /^json$/i) { print STDERR "Outputting data in JSON syntax.\n"; print OUTFILE $jsondata; } else { print STDERR "Parsing JSON data.\n"; my $data = from_json($jsondata); $data = $data->{'results'}->{'bindings'}; # the sparql->results->bindings shuffle. if ($syntax =~ /^(nq|nquads?)$/i) { print STDERR "Outputting data in N-Quads syntax.\n"; foreach my $binding (@$data) { print OUTFILE sprintf("%s %s %s %s .\n", n3_token($binding->{'g'}, 'nq', 'g'), n3_token($binding->{'s'}, 'nq', 's'), n3_token($binding->{'p'}, 'nq', 'p'), n3_token($binding->{'o'}, 'nq', 'o')); } } else { print STDERR "Sorting parsed data.\n"; my @sorted = sort { $a->{'g'}->{'value'} cmp $b->{'g'}->{'value'} } @$data; if ($syntax =~ /trix/i) { my $current_graph = undef; print STDERR "Outputting data in TriX syntax.\n"; print OUTFILE "\n"; foreach my $binding (@sorted) { if ($binding->{'g'}->{'value'} ne $current_graph) { if (defined $current_graph) { print OUTFILE "\t\n"; } print OUTFILE "\t\n"; print OUTFILE "\t\t" . trix_token($binding->{'g'}) . "\n"; $current_graph = $binding->{'g'}->{'value'}; } print OUTFILE sprintf("\t\t%s%s%s\n", trix_token($binding->{'s'}), trix_token($binding->{'p'}), trix_token($binding->{'o'})); } if (defined $current_graph) { print OUTFILE "\t\n"; } print OUTFILE "\n"; } elsif ($syntax =~ /^(trig|n3)$/i) { my $rv; my $current_graph = undef; print STDERR "Outputting data in TriG/N3 syntax.\n"; foreach my $binding (@sorted) { if ($binding->{'g'}->{'value'} ne $current_graph) { if (defined $current_graph) { if ($syntax =~ /trig/) { $rv .= "}\n\n"; } else { $rv .= "} .\n\n"; } } if ($syntax =~ /trig/) { $rv .= sprintf("%s {\n", n3_token($binding->{'g'}, lc $syntax, 'g')); } else { $rv .= sprintf("%s = {\n", n3_token($binding->{'g'}, lc $syntax, 'g')); } $current_graph = $binding->{'g'}->{'value'}; } $rv.= sprintf("\t%s %s %s .\n", n3_token($binding->{'s'}, lc $syntax, 's'), n3_token($binding->{'p'}, lc $syntax, 'p'), n3_token($binding->{'o'}, lc $syntax, 'o')); } if (defined $current_graph) { if ($syntax =~ /trig/) { $rv .= "}\n\n"; } else { $rv .= "} .\n\n"; } } $rv = &prefixes . $rv; print OUTFILE $rv; } } } close OUTFILE; sub trix_token { my $tok = shift; if ($tok->{'type'} eq 'uri') { return sprintf('%s', encode_entities_numeric($tok->{'value'})); } elsif ($tok->{'type'} eq 'bnode') { return sprintf('%s', encode_entities_numeric($tok->{'value'})); } elsif ($tok->{'type'} eq 'literal' && defined $tok->{'datatype'}) { return sprintf('%s', encode_entities_numeric($tok->{'datatype'}), encode_entities_numeric($tok->{'value'})); } elsif ($tok->{'type'} eq 'literal' && defined $tok->{'xml:lang'}) { return sprintf('%s', encode_entities_numeric($tok->{'xml:lang'}), encode_entities_numeric($tok->{'value'})); } else { return sprintf('%s', encode_entities_numeric($tok->{'value'})); } return ""; } sub n3_token { my $tok = shift; my $dialect = shift || 'nq'; my $position = shift || 'x'; if ($dialect =~ /^(n3|trig|ttl)$/ && $tok->{'type'} eq 'uri') { my $curie = get_curie($tok->{'value'}); return $curie if defined $curie; } return sprintf('<%s>', $tok->{'value'}) if $tok->{'type'} eq 'uri'; return sprintf('_:%s', $tok->{'value'}) if $tok->{'type'} eq 'bnode'; my $value = $tok->{'value'}; $value =~ s/\\/\\\\/g; $value =~ s/\"/\\\"/g; $value =~ s/\r/\\\r/g; $value =~ s/\n/\\\n/g; $value =~ s/\t/\\\t/g; $value =~ s/([^\x20-\x7F])/sprintf('\u%04x', ord $1)/eg; $value = "\"$value\""; if ($tok->{'xml:lang'}) { $value .= sprintf('@%s', $tok->{'xml:lang'}); } elsif ($tok->{'datatype'}) { my $dtcurie; $dtcurie = get_curie($tok->{'datatype'}) if ($dialect =~ /^(n3|trig|ttl)$/); if (defined $dtcurie) { $value .= sprintf('^^%s', $dtcurie); } else { $value .= sprintf('^^<%s>', $tok->{'datatype'}); } } return $value; } sub get_curie { my $uri = shift; return 'a' if $uri eq 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; if ($uri =~ m!^(.+[#/])([A-Za-z0-9_\.-]*)$!) { if (defined $expand->{$1}) { $prefixes->{ $expand->{$1} } = $1; return $expand->{$1} . ':' . $2; } } return undef; } sub prefixes { my $rv; foreach my $p (sort keys %$prefixes) { $rv .= sprintf("\@prefix %-8s <%s> .\n", $p.':', $prefixes->{$p}); } return "$rv\n"; }