#!/usr/bin/perl use common::sense; use utf8; use CGI; use Data::Dumper; use Encode; use HTML::Entities (); # Fix CGI.pm's no-existant unicode handling. my $query = CGI->new; my $form_input = {}; foreach my $name ($query->param) { my @val = $query->param($name); foreach ( @val ) { $_ = Encode::decode_utf8( $_ ); } $name = Encode::decode_utf8( $name ); $form_input->{$name} = ( scalar @val == 1 ) ? $val[0] : \@val; } my @option_names = ( ['control', 'Unusual control characters', 1], ['tab', 'Tab', 0], ['feed', 'Carriage return', 0], ['spacer', 'Unusual space characters', 1], ['invisible', 'Invisible characters', 1], ['nonbmp', 'Unicode Non-BMP characters', 1], ['nonascii', 'All non-ASCII characters', 0], ); my $debug; my $input = $form_input->{'text'}; my $replacement = $form_input->{'replacement'}; my $submitted = (defined $input) || (defined $replacement); my $options = {}; my $h_options = ''; foreach my $o (@option_names) { my ($o_code, $o_name, $o_isdefault) = @$o; my $o_value = $form_input->{'opt_'.$o_code}; $options->{ $o_code } = 1 if $o_value; $h_options .= sprintf( "\t\t\t\t\t\t

\n" . "\t\t\t\t\t\t\t\n" . "\t\t\t\t\t\t\t\n" . "\t\t\t\t\t\t

\n", HTML::Entities::encode_numeric($o_code), HTML::Entities::encode_numeric($o_code), ( ($submitted ? $o_value : $o_isdefault) ? 'checked' : ''), HTML::Entities::encode_numeric($o_code), HTML::Entities::encode_numeric($o_name)); } my $output = Gremlins::zap($input, $replacement, $options); my $h_output = HTML::Entities::encode_numeric($output, '<&>'); my $h_selected = { $replacement => 'selected' }; print < Zap the Gremlins!

Zap the Gremlins

Paste text below to strip out any nasty characters.



Options


$h_options
EOF package Gremlins; sub zap { my $text = shift; my $repl = shift || ''; my $opt = shift; my $re; unless (defined $opt) { $opt = { 'control' => 1, 'nonascii' => 0, 'nonbmp' => 1, 'spacer' => 1, 'invisible'=> 1, 'feed' => 0, 'tab' => 0, }; } if ($opt->{'control'}) { # ASCII control characters except \t \r \n $re .= '\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}'; if (! $opt->{'nonascii'}) { # ISO 6429 control characters $re .= '\x{0080}-\x{009F}'; # Unicode line breaks $re .= '\x{2028}\x{2029}'; # Unicode language tags $re .= '\x{E0000}-\x{E007F}'; # Unicode Ruby $re .= '\x{FFF9}\x{FFFA}\x{FFFB}'; # Unicode BiDi $re .= '\x{200E}\x{200F}\x{202A}-\x{202E}'; # Unicode illegal character $re .= '\x{FFFF}'; } } if ($opt->{'nonascii'}) { $re .= '\x{0080}-\x{FFFFFFFF}'; } else { if ($opt->{'nonbmp'}) { $re .= '\x{10000}-\x{FFFFFFFF}'; } if ($opt->{'spacer'}) { $re .= '\x{00A0}\x{2000}-\x{200B}\x{202F}\x{205F}\x{3000}\x{FEFF}'; } elsif ($opt->{'invisible'}) { $re .= '\x{200C}-\x{200D}\x{2060}-\x{2064}'; } } if ($opt->{'feed'}) { $re .= '\r'; } if ($opt->{'tab'}) { $re .= '\t'; } $re = "([$re])"; return $text if ($re eq '([])'); if ($repl eq 'U+') { $text =~ s/$re/sprintf((ord($1)>0xFFFF?"[U+%08X]":"[U+%04X]"),ord($1))/eg; } elsif ($repl eq '\u') { $text =~ s/$re/sprintf((ord($1)>0xFFFF?"\U%08X":"\u%04X"),ord($1))/eg; } elsif ($repl eq '&#x') { $text =~ s/$re/sprintf("&#x%X;",ord($1))/eg; } elsif ($repl eq '&#') { $text =~ s/$re/sprintf("&#%d;",ord($1))/eg; } else { $text =~ s/$re/$repl/g; } return $text; } 1;