#!/usr/bin/perl

# JSON::PP is far too slow!
BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS'; }

use Getopt::Long;
use HTML::Entities qw(encode_entities_numeric);
use JSON;
use strict;
use utf8;

my $sdbroot  = $ENV{SDBROOT};
my $initfile = "$sdbroot/sdb.ttl";
my $syntax   = 'nquad';
my $prefixes = {};
my $expand = {
	'http://purl.org/vocab/bio/0.1/' => 'bio' ,
	'http://creativecommons.org/ns#' => 'cc' ,
	'http://web.resource.org/cc/' => 'ccold' ,
	'http://creativecommons.org/ns#' => 'ccrel' ,
	'http://purl.org/dc/terms/' => 'dc' ,
	'http://purl.org/dc/elements/1.1/' => 'dc11' ,
	'http://ramonantonio.net/doac/0.1/#' => 'doac' ,
	'http://usefulinc.com/ns/doap#' => 'doap' ,
	'http://xmlns.com/foaf/0.1/' => 'foaf' ,
	'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo' , 
	'http://www.w3.org/2002/12/cal/icaltzd#' => 'ical' ,
	'http://laconi.ca/ont/' => 'lac' ,
	'http://ontologi.es/like#' => 'like' ,
	'http://www.w3.org/2000/10/swap/log#' => 'log' ,
	'http://purl.org/ontology/mo/' => 'mo' ,
	'http://open.vocab.org/terms/' => 'ov' ,
	'http://www.w3.org/2002/07/owl#' => 'owl' ,
	'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf' ,
	'http://www.w3.org/2004/03/trix/rdfg-1/' => 'rdfg' ,
	'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs' ,
	'http://purl.org/vocab/relationship/' => 'rel' ,
	'http://purl.org/stuff/rev#' => 'rev' ,
	'http://purl.org/rss/1.0/' => 'rss' ,
	'http://rdfs.org/sioc/ns#' => 'sioc' ,
	'http://www.w3.org/2004/02/skos/core#' => 'skos' ,
	'http://www.holygoat.co.uk/owl/redwood/0.1/tags/' => 'tags' ,
	'http://www.w3.org/2006/vcard/ns#' => 'vcard' ,
	'http://xmlns.com/wot/0.1/' => 'wot' ,
	'http://vocab.sindice.com/xfn#' => 'xfn' ,
	'http://www.w3.org/1999/xhtml/vocab#' => 'xhv' ,
	'http://www.w3.org/2001/XMLSchema#' => 'xsd' ,
	};

GetOptions(
	'sdb=s'      => \$initfile ,
	'syntax=s'   => \$syntax ,
);

my $outfile = shift @ARGV || die "Please specify output file name.\n";

# Note that sdb2load.pl only imports JSON, NQuads and TriX. (N3 and TriG are not imported.)
die "Unrecognised syntax. Valid values are 'json', 'nquad', 'n3', 'trig' and 'trix'.\n"
	unless $syntax =~ /^(json|nq|nquads?|trix|trig|n3)$/i;

print STDERR "Querying triple store for data.\n";
my $jsondata;
open JSONDATA, "-|:encoding(UTF-8)", "$sdbroot/bin/sdbquery --results=JSON --sdb=$initfile 'SELECT ?g ?s ?p ?o WHERE { GRAPH ?g {?s ?p ?o}}'";
$jsondata = <JSONDATA>;
$jsondata = '' if $jsondata =~ /Warning: SDB_JDBC not set/; # first line might be rubbish!
$jsondata .= $_ while <JSONDATA>;
close JSONDATA;

open OUTFILE, ">$outfile";

if ($syntax =~ /^json$/i)
{
	print STDERR "Outputting data in JSON syntax.\n";
	print OUTFILE $jsondata;
}
else
{
	print STDERR "Parsing JSON data.\n";
	my $data = from_json($jsondata);
	$data = $data->{'results'}->{'bindings'}; # the sparql->results->bindings shuffle.
	
	if ($syntax =~ /^(nq|nquads?)$/i)
	{
		print STDERR "Outputting data in N-Quads syntax.\n";
		foreach my $binding (@$data)
		{
			print OUTFILE sprintf("%s %s %s %s .\n",
				n3_token($binding->{'g'}, 'nq', 'g'),
				n3_token($binding->{'s'}, 'nq', 's'),
				n3_token($binding->{'p'}, 'nq', 'p'),
				n3_token($binding->{'o'}, 'nq', 'o'));
		}
	}
	else
	{
		print STDERR "Sorting parsed data.\n";
		my @sorted = sort { $a->{'g'}->{'value'} cmp $b->{'g'}->{'value'} } @$data;
		
		if ($syntax =~ /trix/i)
		{
			my $current_graph = undef;
			
			print STDERR "Outputting data in TriX syntax.\n";
			
			print OUTFILE "<TriX xmlns=\"http://www.w3.org/2004/03/trix/trix-1/\">\n";
			
			foreach my $binding (@sorted)
			{
				if ($binding->{'g'}->{'value'} ne $current_graph)
				{
					if (defined $current_graph)
					{
						print OUTFILE "\t</graph>\n";
					}
					print OUTFILE "\t<graph>\n";
					print OUTFILE "\t\t" . trix_token($binding->{'g'}) . "\n";
					
					$current_graph = $binding->{'g'}->{'value'};
				}

				print OUTFILE sprintf("\t\t<triple>%s%s%s</triple>\n",
					trix_token($binding->{'s'}),
					trix_token($binding->{'p'}),
					trix_token($binding->{'o'}));
			}
			
			if (defined $current_graph)
			{
				print OUTFILE "\t</graph>\n";
			}

			print OUTFILE "</TriX>\n";
		}
		elsif ($syntax =~ /^(trig|n3)$/i)
		{
			my $rv;
			my $current_graph = undef;
			
			print STDERR "Outputting data in TriG/N3 syntax.\n";
			foreach my $binding (@sorted)
			{
				if ($binding->{'g'}->{'value'} ne $current_graph)
				{
					if (defined $current_graph)
					{
						if ($syntax =~ /trig/)
							{ $rv .= "}\n\n"; }
						else
							{ $rv .= "} .\n\n"; }
					}
					
					if ($syntax =~ /trig/)
					{
						$rv .= sprintf("%s {\n",
							n3_token($binding->{'g'}, lc $syntax, 'g'));
					}
					else
					{
						$rv .= sprintf("%s = {\n",
							n3_token($binding->{'g'}, lc $syntax, 'g'));
					}
					
					$current_graph = $binding->{'g'}->{'value'};
				}

				$rv.= sprintf("\t%s %s %s .\n",
					n3_token($binding->{'s'}, lc $syntax, 's'),
					n3_token($binding->{'p'}, lc $syntax, 'p'),
					n3_token($binding->{'o'}, lc $syntax, 'o'));
			}
			
			if (defined $current_graph)
			{
				if ($syntax =~ /trig/)
					{ $rv .= "}\n\n"; }
				else
					{ $rv .= "} .\n\n"; }
			}
			
			$rv = &prefixes . $rv;
			
			print OUTFILE $rv;
		}
	}
}

close OUTFILE;

sub trix_token
{
	my $tok      = shift;

	if ($tok->{'type'} eq 'uri')
	{
		return sprintf('<uri>%s</uri>',
			encode_entities_numeric($tok->{'value'}));
	}
	elsif ($tok->{'type'} eq 'bnode')
	{
		return sprintf('<id>%s</id>',
			encode_entities_numeric($tok->{'value'}));
	}
	elsif ($tok->{'type'} eq 'literal' && defined $tok->{'datatype'})
	{
		return sprintf('<typedLiteral datatype="%s">%s</typedLiteral>',
			encode_entities_numeric($tok->{'datatype'}),
			encode_entities_numeric($tok->{'value'}));
	}
	elsif ($tok->{'type'} eq 'literal' && defined $tok->{'xml:lang'})
	{
		return sprintf('<plainLiteral xml:lang="%s">%s</plainLiteral>',
			encode_entities_numeric($tok->{'xml:lang'}),
			encode_entities_numeric($tok->{'value'}));
	}
	else
	{
		return sprintf('<plainLiteral>%s</plainLiteral>',
			encode_entities_numeric($tok->{'value'}));
	}
	
	return "<error />";
}

sub n3_token
{
	my $tok      = shift;
	my $dialect  = shift || 'nq';
	my $position = shift || 'x';
	
	if ($dialect =~ /^(n3|trig|ttl)$/ && $tok->{'type'} eq 'uri')
	{
		my $curie = get_curie($tok->{'value'});
		return $curie if defined $curie;
	}
	
	return sprintf('<%s>', $tok->{'value'})
		if $tok->{'type'} eq 'uri';

	return sprintf('_:%s', $tok->{'value'})
		if $tok->{'type'} eq 'bnode';

	my $value = $tok->{'value'};
	$value =~ s/\\/\\\\/g;
	$value =~ s/\"/\\\"/g;
	$value =~ s/\r/\\\r/g;
	$value =~ s/\n/\\\n/g;
	$value =~ s/\t/\\\t/g;
	$value =~ s/([^\x20-\x7F])/sprintf('\u%04x', ord $1)/eg;
	
	$value = "\"$value\"";
	if ($tok->{'xml:lang'})
	{
		$value .= sprintf('@%s', $tok->{'xml:lang'});
	}
	elsif ($tok->{'datatype'})
	{
		my $dtcurie;
		$dtcurie = get_curie($tok->{'datatype'})
			if ($dialect =~ /^(n3|trig|ttl)$/);
		
		if (defined $dtcurie)
			{ $value .= sprintf('^^%s', $dtcurie); }
		else
			{ $value .= sprintf('^^<%s>', $tok->{'datatype'}); }
	}
	
	return $value;
}

sub get_curie
{
	my $uri = shift;
	
	return 'a' if $uri eq 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
	
	if ($uri =~ m!^(.+[#/])([A-Za-z0-9_\.-]*)$!)
	{
		if (defined $expand->{$1})
		{
			$prefixes->{ $expand->{$1} } = $1;
			return $expand->{$1} . ':' . $2;
		}
	}
	
	return undef;
}

sub prefixes
{
	my $rv;
	foreach my $p (sort keys %$prefixes)
	{
		$rv .= sprintf("\@prefix %-8s <%s> .\n",
			$p.':',
			$prefixes->{$p});
	}
	return "$rv\n";
}