#!/usr/bin/perl

use common::sense;
use utf8;

use CGI;
use Data::Dumper;
use Encode;
use HTML::Entities ();

# Fix CGI.pm's no-existant unicode handling.
my $query = CGI->new;
my $form_input = {};  
foreach my $name ($query->param)
{
	my @val = $query->param($name);
	foreach ( @val )
	{
		$_ = Encode::decode_utf8( $_ );
	}
	$name = Encode::decode_utf8( $name );
	$form_input->{$name} = ( scalar @val == 1 ) ? $val[0] : \@val;
}

my @option_names = (
	['control',   'Unusual control characters',    1],
	['tab',       'Tab',                           0],
	['feed',      'Carriage return',               0],
	['spacer',    'Unusual space characters',      1],
	['invisible', 'Invisible characters',          1],
	['nonbmp',    'Unicode Non-BMP characters',    1],
	['nonascii',  'All non-ASCII characters',      0],
	);

my $debug;
my $input       = $form_input->{'text'};
my $replacement = $form_input->{'replacement'};
my $submitted   = (defined $input) || (defined $replacement);
my $options     = {};
my $h_options   = '';
foreach my $o (@option_names)
{
	my ($o_code, $o_name, $o_isdefault) = @$o;
	
	my $o_value = $form_input->{'opt_'.$o_code};
	$options->{ $o_code } = 1 if $o_value;
	
	$h_options .= sprintf(
		  "\t\t\t\t\t\t<p>\n"
		. "\t\t\t\t\t\t\t<input type=\"checkbox\" id=\"opt_%s\" name=\"opt_%s\" value=\"1\" %s>\n"
		. "\t\t\t\t\t\t\t<label for=\"opt_%s\">%s</label>\n"
		. "\t\t\t\t\t\t</p>\n",
		HTML::Entities::encode_numeric($o_code),
		HTML::Entities::encode_numeric($o_code),
		( ($submitted ? $o_value : $o_isdefault) ? 'checked' : ''),
		HTML::Entities::encode_numeric($o_code),
		HTML::Entities::encode_numeric($o_name));
}

my $output = Gremlins::zap($input, $replacement, $options);

my $h_output   = HTML::Entities::encode_numeric($output, '<&>');
my $h_selected = { $replacement => 'selected' };

print <<EOF;
Content-Type: text/html; charset=utf-8

<!doctype html system "http://doctype.be/402">
<html>
	<head profile="http://www.w3.org/1999/xhtml/vocab">
		<title>Zap the Gremlins!</title>
		<link rel="stylesheet" href="gremlins.css" type="text/css">
	</head>
	<body>
		<h1>Zap the Gremlins</h1>
		<p class="explain">Paste text below to strip out any nasty characters.</p>
		<form method="post" action="">
			<div>
				<div id="main">
					<label for="text">Text:</label><br>
					<textarea id="text" name="text" rows="8" cols="60"
					>$h_output</textarea><br>
					<input value="Zap" type="submit">
				</div>
				<div id="options">
					<h2>Options</h2>
					<p>
						<label for="replacement">Replacement:</label><br>
						<select name="replacement" id="replacement">
							<option $h_selected->{''} value=""></option>
							<option $h_selected->{'U+'} value="U+">U+12AB</option>
							<option $h_selected->{'\\u'} value="\\u">\\u12AB</option>
							<option $h_selected->{'&#'} value="&amp;#">&amp;#4779;</option>
							<option $h_selected->{'&#x'} value="&amp;#x">&amp;#x12AB;</option>
							<option $h_selected->{'•'} value="&#x2022">&#x2022</option>
							<option $h_selected->{'?'} value="?">?</option>
						</select>
					</p>
$h_options
				</div>
			</div>
		</form>
	</body>
</html>
EOF

package Gremlins;

sub zap
{
	my $text = shift;
	my $repl = shift || '';
	my $opt  = shift;
	my $re;
	
	unless (defined $opt)
	{
		$opt = {
			'control'  => 1,
			'nonascii' => 0,
			'nonbmp'   => 1,
			'spacer'   => 1,
			'invisible'=> 1,
			'feed'     => 0,
			'tab'      => 0,
			};
	}		

	if ($opt->{'control'})
	{
		# ASCII control characters except \t \r \n
		$re .= '\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}';
		
		if (! $opt->{'nonascii'})
		{
			# ISO 6429 control characters
			$re .= '\x{0080}-\x{009F}';

			# Unicode line breaks
			$re .= '\x{2028}\x{2029}';
			
			# Unicode language tags
			$re .= '\x{E0000}-\x{E007F}';
			
			# Unicode Ruby
			$re .= '\x{FFF9}\x{FFFA}\x{FFFB}';
			
			# Unicode BiDi
			$re .= '\x{200E}\x{200F}\x{202A}-\x{202E}';
			
			# Unicode illegal character
			$re .= '\x{FFFF}';
		}
	}

	if ($opt->{'nonascii'})
	{
		$re .= '\x{0080}-\x{FFFFFFFF}';
	}
	else
	{
		if ($opt->{'nonbmp'})
		{
			$re .= '\x{10000}-\x{FFFFFFFF}';
		}
		
		if ($opt->{'spacer'})
		{
			$re .= '\x{00A0}\x{2000}-\x{200B}\x{202F}\x{205F}\x{3000}\x{FEFF}';
		}
		elsif ($opt->{'invisible'})
		{
			$re .= '\x{200C}-\x{200D}\x{2060}-\x{2064}';
		}
	}

	if ($opt->{'feed'})
	{
		$re .= '\r';
	}

	if ($opt->{'tab'})
	{
		$re .= '\t';
	}

	$re = "([$re])";

	return $text if ($re eq '([])');

	if ($repl eq 'U+')
	{
		$text =~ s/$re/sprintf((ord($1)>0xFFFF?"[U+%08X]":"[U+%04X]"),ord($1))/eg;
	}
	elsif ($repl eq '\u')
	{
		$text =~ s/$re/sprintf((ord($1)>0xFFFF?"\U%08X":"\u%04X"),ord($1))/eg;
	}
	elsif ($repl eq '&#x')
	{
		$text =~ s/$re/sprintf("&#x%X;",ord($1))/eg;
	}
	elsif ($repl eq '&#')
	{
		$text =~ s/$re/sprintf("&#%d;",ord($1))/eg;
	}
	else
	{
		$text =~ s/$re/$repl/g;
	}
	
	return $text;
}

1;