diff options
Diffstat (limited to 'wiki/templates/Scrubber.pm')
-rw-r--r-- | wiki/templates/Scrubber.pm | 749 |
1 files changed, 749 insertions, 0 deletions
diff --git a/wiki/templates/Scrubber.pm b/wiki/templates/Scrubber.pm new file mode 100644 index 0000000..2efaa10 --- /dev/null +++ b/wiki/templates/Scrubber.pm @@ -0,0 +1,749 @@ +package HTML::Scrubber; + +# ABSTRACT: Perl extension for scrubbing/sanitizing html + + +use 5.008; # enforce minimum perl version of 5.8 +use strict; +use warnings; +use HTML::Parser 3.47 (); +use HTML::Entities; +use Scalar::Util ('weaken'); + +our ( @_scrub, @_scrub_fh ); + +our $VERSION = '0.15'; # VERSION +our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY + +# my my my my, these here to prevent foolishness like +# http://perlmonks.org/index.pl?node_id=251127#Stealing+Lexicals +(@_scrub) = ( \&_scrub, "self, event, tagname, attr, attrseq, text" ); +(@_scrub_fh) = ( \&_scrub_fh, "self, event, tagname, attr, attrseq, text" ); + +sub new { + my $package = shift; + my $p = HTML::Parser->new( + api_version => 3, + default_h => \@_scrub, + marked_sections => 0, + strict_comment => 0, + unbroken_text => 1, + case_sensitive => 0, + boolean_attribute_value => undef, + empty_element_tags => 1, + ); + + my $self = { + _p => $p, + _rules => { '*' => 0, }, + _comment => 0, + _process => 0, + _r => "", + _optimize => 1, + _script => 0, + _style => 0, + }; + + $p->{"\0_s"} = bless $self, $package; + weaken( $p->{"\0_s"} ); + + return $self unless @_; + + my (%args) = @_; + + for my $f (qw[ default allow deny rules process comment ]) { + next unless exists $args{$f}; + if ( ref $args{$f} ) { + $self->$f( @{ $args{$f} } ); + } + else { + $self->$f( $args{$f} ); + } + } + + return $self; +} + + +sub comment { + return $_[0]->{_comment} + if @_ == 1; + $_[0]->{_comment} = $_[1]; + return; +} + + +sub process { + return $_[0]->{_process} + if @_ == 1; + $_[0]->{_process} = $_[1]; + return; +} + + +sub script { + return $_[0]->{_script} + if @_ == 1; + $_[0]->{_script} = $_[1]; + return; +} + + +sub style { + return $_[0]->{_style} + if @_ == 1; + $_[0]->{_style} = $_[1]; + return; +} + + +sub allow { + my $self = shift; + for my $k (@_) { + $self->{_rules}{ lc $k } = 1; + } + $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse + + return; +} + + +sub deny { + my $self = shift; + + for my $k (@_) { + $self->{_rules}{ lc $k } = 0; + } + + $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse + + return; +} + + +sub rules { + my $self = shift; + my (%rules) = @_; + for my $k ( keys %rules ) { + $self->{_rules}{ lc $k } = $rules{$k}; + } + + $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse + + return; +} + + +sub default { + return $_[0]->{_rules}{'*'} + if @_ == 1; + + $_[0]->{_rules}{'*'} = $_[1] if defined $_[1]; + $_[0]->{_rules}{'_'} = $_[2] if defined $_[2] and ref $_[2]; + $_[0]->{_optimize} = 1; # each time a rule changes, reoptimize when parse + + return; +} + + +sub scrub_file { + if ( @_ > 2 ) { + return unless defined $_[0]->_out( $_[2] ); + } + else { + $_[0]->{_p}->handler( default => @_scrub ); + } + + $_[0]->_optimize(); #if $_[0]->{_optimize}; + + $_[0]->{_p}->parse_file( $_[1] ); + + return delete $_[0]->{_r} unless exists $_[0]->{_out}; + print { $_[0]->{_out} } $_[0]->{_r} if length $_[0]->{_r}; + delete $_[0]->{_out}; + return 1; +} + + +sub scrub { + if ( @_ > 2 ) { + return unless defined $_[0]->_out( $_[2] ); + } + else { + $_[0]->{_p}->handler( default => @_scrub ); + } + + $_[0]->_optimize(); # if $_[0]->{_optimize}; + + $_[0]->{_p}->parse( $_[1] ) if defined( $_[1] ); + $_[0]->{_p}->eof(); + + return delete $_[0]->{_r} unless exists $_[0]->{_out}; + delete $_[0]->{_out}; + return 1; +} + + +sub _out { + my ( $self, $o ) = @_; + + unless ( ref $o and ref \$o ne 'GLOB' ) { + open my $F, '>', $o or return; + binmode $F; + $self->{_out} = $F; + } + else { + $self->{_out} = $o; + } + + $self->{_p}->handler( default => @_scrub_fh ); + + return 1; +} + + +sub _validate { + my ( $s, $t, $r, $a, $as ) = @_; + return "<$t>" unless %$a; + + $r = $s->{_rules}->{$r}; + my %f; + + for my $k ( keys %$a ) { + my $check = exists $r->{$k} ? $r->{$k} : exists $r->{'*'} ? $r->{'*'} : next; + + if ( ref $check eq 'CODE' ) { + my @v = $check->( $s, $t, $k, $a->{$k}, $a, \%f ); + next unless @v; + $f{$k} = shift @v; + } + elsif ( ref $check || length($check) > 1 ) { + $f{$k} = $a->{$k} if $a->{$k} =~ m{$check}; + } + elsif ($check) { + $f{$k} = $a->{$k}; + } + } + + if (%f) { + my %seen; + return "<$t $r>" + if $r = join ' ', map { + defined $f{$_} + ? qq[$_="] . encode_entities( $f{$_} ) . q["] + : $_; # boolean attribute (TODO?) + } grep { exists $f{$_} and !$seen{$_}++; } @$as; + } + + return "<$t>"; +} + + +sub _scrub_str { + my ( $p, $e, $t, $a, $as, $text ) = @_; + + my $s = $p->{"\0_s"}; + my $outstr = ''; + + if ( $e eq 'start' ) { + if ( exists $s->{_rules}->{$t} ) # is there a specific rule + { + if ( ref $s->{_rules}->{$t} ) # is it complicated?(not simple;) + { + $outstr .= $s->_validate( $t, $t, $a, $as ); + } + elsif ( $s->{_rules}->{$t} ) # validate using default attribute rule + { + $outstr .= $s->_validate( $t, '_', $a, $as ); + } + } + elsif ( $s->{_rules}->{'*'} ) # default allow tags + { + $outstr .= $s->_validate( $t, '_', $a, $as ); + } + } + elsif ( $e eq 'end' ) { + my $place = 0; + if ( exists $s->{_rules}->{$t} ) { + $place = 1 if $s->{_rules}->{$t}; + } + elsif ( $s->{_rules}->{'*'} ) { + $place = 1; + } + if ($place) { + if ( length $text ) { + $outstr .= "</$t>"; + } + else { + substr $s->{_r}, -1, 0, ' /'; + } + } + } + elsif ( $e eq 'comment' ) { + if ( $s->{_comment} ) { + + # only copy comments through if they are well formed... + $outstr .= $text if ( $text =~ m|^<!--.*-->$|ms ); + } + } + elsif ( $e eq 'process' ) { + $outstr .= $text if $s->{_process}; + } + elsif ( $e eq 'text' or $e eq 'default' ) { + $text =~ s/</</g; #https://rt.cpan.org/Ticket/Attachment/8716/10332/scrubber.patch + $text =~ s/>/>/g; + + $outstr .= $text; + } + elsif ( $e eq 'start_document' ) { + $outstr = ""; + } + + return $outstr; +} + + +sub _scrub_fh { + my $self = $_[0]->{"\0_s"}; + print { $self->{_out} } $self->{'_r'} if length $self->{_r}; + $self->{'_r'} = _scrub_str(@_); +} + + +sub _scrub { + + $_[0]->{"\0_s"}->{_r} .= _scrub_str(@_); +} + +sub _optimize { + my ($self) = @_; + + my (@ignore_elements) = grep { not $self->{"_$_"} } qw(script style); + $self->{_p}->ignore_elements(@ignore_elements); # if @ is empty, we reset ;) + + return unless $self->{_optimize}; + + #sub allow + # return unless $self->{_optimize}; # till I figure it out (huh) + + if ( $self->{_rules}{'*'} ) { # default allow + $self->{_p}->report_tags(); # so clear it + } + else { + + my (@reports) = + grep { # report only tags we want + $self->{_rules}{$_} + } keys %{ $self->{_rules} }; + + $self->{_p}->report_tags( # default deny, so optimize + @reports + ) if @reports; + } + + # sub deny + # return unless $self->{_optimize}; # till I figure it out (huh) + my (@ignores) = + grep { not $self->{_rules}{$_} } grep { $_ ne '*' } keys %{ $self->{_rules} }; + + $self->{_p}->ignore_tags( # always ignore stuff we don't want + @ignores + ) if @ignores; + + $self->{_optimize} = 0; + return; +} + +1; + +#print sprintf q[ '%-12s => %s,], "$_'", $h{$_} for sort keys %h;# perl! +#perl -ne"chomp;print $_;print qq'\t\t# test ', ++$a if /ok\(/;print $/" test.pl >test2.pl +#perl -ne"chomp;print $_;if( /ok\(/ ){s/\#test \d+$//;print qq'\t\t# test ', ++$a }print $/" test.pl >test2.pl +#perl -ne"chomp;if(/ok\(/){s/# test .*$//;print$_,qq'\t\t# test ',++$a}else{print$_}print$/" test.pl >test2.pl + +__END__ + +=pod + +=for stopwords html cpan callback homepage Perlbrew perltidy respository + +=head1 NAME + +HTML::Scrubber - Perl extension for scrubbing/sanitizing html + +=head1 VERSION + +version 0.15 + +=head1 SYNOPSIS + + use HTML::Scrubber; + + my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); + print $scrubber->scrub('<p><b>bold</b> <em>missing</em></p>'); + # output is: <p><b>bold</b> </p> + + # more complex input + my $html = q[ + <style type="text/css"> BAD { background: #666; color: #666;} </style> + <script language="javascript"> alert("Hello, I am EVIL!"); </script> + <HR> + a => <a href=1>link </a> + br => <br> + b => <B> bold </B> + u => <U> UNDERLINE </U> + ]; + + print $scrubber->scrub($html); + + $scrubber->deny( qw[ p b i u hr br ] ); + + print $scrubber->scrub($html); + +=head1 DESCRIPTION + +If you want to "scrub" or "sanitize" html input in a reliable and flexible +fashion, then this module is for you. + +I wasn't satisfied with HTML::Sanitizer because it is based on +HTML::TreeBuilder, so I thought I'd write something similar that works directly +with HTML::Parser. + +=head1 METHODS + +First a note on documentation: just study the L<EXAMPLE|"EXAMPLE"> below. It's +all the documentation you could need + +Also, be sure to read all the comments as well as L<How does it work?|"How does +it work?">. + +If you're new to perl, good luck to you. + +=head2 comment + + warn "comments are ", $p->comment ? 'allowed' : 'not allowed'; + $p->comment(0); # off by default + +=head2 process + + warn "process instructions are ", $p->process ? 'allowed' : 'not allowed'; + $p->process(0); # off by default + +=head2 script + + warn "script tags (and everything in between) are supressed" + if $p->script; # off by default + $p->script( 0 || 1 ); + +B<**> Please note that this is implemented using HTML::Parser's ignore_elements +function, so if C<script> is set to true, all script tags encountered will be +validated like all other tags. + +=head2 style + + warn "style tags (and everything in between) are supressed" + if $p->style; # off by default + $p->style( 0 || 1 ); + +B<**> Please note that this is implemented using HTML::Parser's ignore_elements +function, so if C<style> is set to true, all style tags encountered will be +validated like all other tags. + +=head2 allow + + $p->allow(qw[ t a g s ]); + +=head2 deny + + $p->deny(qw[ t a g s ]); + +=head2 rules + + $p->rules( + img => { + src => qr{^(?!http://)}i, # only relative image links allowed + alt => 1, # alt attribute allowed + '*' => 0, # deny all other attributes + }, + a => { + href => sub { ... }, # check or adjust with a callback + }, + b => 1, + ... + ); + +Updates set of attribute rules. Each rule can be 1/0, regular expression or a +callback. Values longer than 1 char are treated as regexps. Callback is called +with the following arguments: this object, tag name, attribute name and +attribute value, should return empty list to drop attribute, C<undef> to keep +it without value or a new scalar value. + +=head2 default + + print "default is ", $p->default(); + $p->default(1); # allow tags by default + $p->default( + undef, # don't change + { # default attribute rules + '*' => 1, # allow attributes by default + } + ); + +=head2 scrub_file + + $html = $scrubber->scrub_file('foo.html'); ## returns giant string + die "Eeek $!" unless defined $html; ## opening foo.html may have failed + $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!"; + $scrubber->scrub_file('foo.html', *STDOUT) + or die "Eeek $!" + if fileno STDOUT; + +=head2 scrub + + print $scrubber->scrub($html); ## returns giant string + $scrubber->scrub($html, 'new.html') or die "Eeek $!"; + $scrubber->scrub($html', *STDOUT) + or die "Eeek $!" + if fileno STDOUT; + +=for comment _out + $scrubber->_out(*STDOUT) if fileno STDOUT; + $scrubber->_out('foo.html') or die "eeek $!"; + +=for comment _validate +Uses $self->{_rules} to do attribute validation. +Takes tag, rule('_' || $tag), attrref. + +=for comment _scrub_str + +I<default> handler, used by both _scrub and _scrub_fh Moved all the common code +(basically all of it) into a single routine for ease of maintenance + +=for comment _scrub_fh + +I<default> handler, does the scrubbing if we're scrubbing out to a file. Now +calls _scrub_str and pushes that out to a file. + +=for comment _scrub + +I<default> handler, does the scrubbing if we're returning a giant string. Now +calls _scrub_str and appends that to the output string. + +=head1 How does it work? + +When a tag is encountered, HTML::Scrubber allows/denies the tag using the +explicit rule if one exists. + +If no explicit rule exists, Scrubber applies the default rule. + +If an explicit rule exists, but it's a simple rule(1), the default attribute +rule is applied. + +=head2 EXAMPLE + +=for example begin + + #!/usr/bin/perl -w + use HTML::Scrubber; + use strict; + + my @allow = qw[ br hr b a ]; + + my @rules = ( + script => 0, + img => { + src => qr{^(?!http://)}i, # only relative image links allowed + alt => 1, # alt attribute allowed + '*' => 0, # deny all other attributes + }, + ); + + my @default = ( + 0 => # default rule, deny all tags + { + '*' => 1, # default rule, allow all attributes + 'href' => qr{^(?:http|https|ftp)://}i, + 'src' => qr{^(?:http|https|ftp)://}i, + + # If your perl doesn't have qr + # just use a string with length greater than 1 + 'cite' => '(?i-xsm:^(?:http|https|ftp):)', + 'language' => 0, + 'name' => 1, # could be sneaky, but hey ;) + 'onblur' => 0, + 'onchange' => 0, + 'onclick' => 0, + 'ondblclick' => 0, + 'onerror' => 0, + 'onfocus' => 0, + 'onkeydown' => 0, + 'onkeypress' => 0, + 'onkeyup' => 0, + 'onload' => 0, + 'onmousedown' => 0, + 'onmousemove' => 0, + 'onmouseout' => 0, + 'onmouseover' => 0, + 'onmouseup' => 0, + 'onreset' => 0, + 'onselect' => 0, + 'onsubmit' => 0, + 'onunload' => 0, + 'src' => 0, + 'type' => 0, + } + ); + + my $scrubber = HTML::Scrubber->new(); + $scrubber->allow(@allow); + $scrubber->rules(@rules); # key/value pairs + $scrubber->default(@default); + $scrubber->comment(1); # 1 allow, 0 deny + + ## preferred way to create the same object + $scrubber = HTML::Scrubber->new( + allow => \@allow, + rules => \@rules, + default => \@default, + comment => 1, + process => 0, + ); + + require Data::Dumper, die Data::Dumper::Dumper($scrubber) if @ARGV; + + my $it = q[ + <?php echo(" EVIL EVIL EVIL "); ?> <!-- asdf --> + <hr> + <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br> + <B> IN BOLD </B><br> + <A NAME="evil"> + <A HREF="javascript:alert('die die die');">HREF=JAVA <!></A> + <br> + <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');"> + <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT"> + </A> + </A> <br> + ]; + + print "#original text", $/, $it, $/; + print + "#scrubbed text (default ", $scrubber->default(), # no arguments returns the current value + " comment ", $scrubber->comment(), " process ", $scrubber->process(), " )", $/, $scrubber->scrub($it), $/; + + $scrubber->default(1); # allow all tags by default + $scrubber->comment(0); # deny comments + + print + "#scrubbed text (default ", + $scrubber->default(), + " comment ", + $scrubber->comment(), + " process ", + $scrubber->process(), + " )", $/, + $scrubber->scrub($it), + $/; + + $scrubber->process(1); # allow process instructions (dangerous) + $default[0] = 1; # allow all tags by default + $default[1]->{'*'} = 0; # deny all attributes by default + $scrubber->default(@default); # set the default again + + print + "#scrubbed text (default ", + $scrubber->default(), + " comment ", + $scrubber->comment(), + " process ", + $scrubber->process(), + " )", $/, + $scrubber->scrub($it), + $/; + +=for example end + +=head2 FUN + +If you have Test::Inline (and you've installed HTML::Scrubber), try + + pod2test Scrubber.pm >scrubber.t + perl scrubber.t + +=head1 SEE ALSO + +L<HTML::Parser>, L<Test::Inline>. + +The C<HTML::Sanitizer> module is no longer available on CPAN. + +=head1 VERSION REQUIREMENTS + +As of version 0.14 I have added a perl minimum version requirement of 5.8. This +is basically due to failures on the smokers perl 5.6 installations - which +appears to be down to installation mechanisms and requirements. + +Since I don't want to spend the time supporting a version that is so old (and +may not work for reasons on UTF support etc), I have added a C<use 5.008;> to +the main module. + +If this is problematic I am very willing to accept patches to fix this up, +although I do not personally see a good reason to support a release that has +been obsolete for 13 years. + +=head1 CONTRIBUTING + +If you want to contribute to the development of this module, the code is on +L<GitHub|http://github.com/nigelm/html-scrubber>. You'll need a perl +environment with L<Dist::Zilla>, and if you're just getting started, there's +some documentation on using Vagrant and Perlbrew +L<here|http://mrcaron.github.io/2015/03/06/Perl-CPAN-Pull-Request.html>. + +There is now a C<.perltidyrc> and a <.tidyallrc> file within the respository +for the standard perltidy settings used - I will apply these before new +releases. Please do not let formatting prevent you from sending in patches etc +- this can be sorted out as part of the release process. Info on C<tidyall> +can be found at +L<https://metacpan.org/pod/distribution/Code-TidyAll/bin/tidyall>. + +=head1 INSTALLATION + +See perlmodinstall for information and options on installing Perl modules. + +=head1 BUGS AND LIMITATIONS + +You can make new bug reports, and view existing ones, through the +web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Scrubber>. + +=head1 AVAILABILITY + +The project homepage is L<https://metacpan.org/release/HTML-Scrubber>. + +The latest version of this module is available from the Comprehensive Perl +Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN +site near you, or see L<https://metacpan.org/module/HTML::Scrubber/>. + +=head1 AUTHORS + +=over 4 + +=item * + +Ruslan Zakirov <Ruslan.Zakirov@gmail.com> + +=item * + +Nigel Metheringham <nigelm@cpan.org> + +=item * + +D. H. <podmaster@cpan.org> + +=back + +=head1 COPYRIGHT AND LICENSE + +This software is copyright (c) 2015 by Ruslan Zakirov, Nigel Metheringham, 2003-2004 D. H.. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +=cut |