summaryrefslogtreecommitdiffstats
path: root/roles/wiki/templates/Scrubber.pm
diff options
context:
space:
mode:
Diffstat (limited to 'roles/wiki/templates/Scrubber.pm')
-rw-r--r--roles/wiki/templates/Scrubber.pm749
1 files changed, 749 insertions, 0 deletions
diff --git a/roles/wiki/templates/Scrubber.pm b/roles/wiki/templates/Scrubber.pm
new file mode 100644
index 0000000..2efaa10
--- /dev/null
+++ b/roles/wiki/templates/Scrubber.pm
@@ -0,0 +1,749 @@
+package HTML::Scrubber;
+
+# ABSTRACT: Perl extension for scrubbing/sanitizing html
+
+
+use 5.008; # enforce minimum perl version of 5.8
+use strict;
+use warnings;
+use HTML::Parser 3.47 ();
+use HTML::Entities;
+use Scalar::Util ('weaken');
+
+our ( @_scrub, @_scrub_fh );
+
+our $VERSION = '0.15'; # VERSION
+our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY
+
+# my my my my, these here to prevent foolishness like
+# http://perlmonks.org/index.pl?node_id=251127#Stealing+Lexicals
+(@_scrub) = ( \&_scrub, "self, event, tagname, attr, attrseq, text" );
+(@_scrub_fh) = ( \&_scrub_fh, "self, event, tagname, attr, attrseq, text" );
+
+sub new {
+ my $package = shift;
+ my $p = HTML::Parser->new(
+ api_version => 3,
+ default_h => \@_scrub,
+ marked_sections => 0,
+ strict_comment => 0,
+ unbroken_text => 1,
+ case_sensitive => 0,
+ boolean_attribute_value => undef,
+ empty_element_tags => 1,
+ );
+
+ my $self = {
+ _p => $p,
+ _rules => { '*' => 0, },
+ _comment => 0,
+ _process => 0,
+ _r => "",
+ _optimize => 1,
+ _script => 0,
+ _style => 0,
+ };
+
+ $p->{"\0_s"} = bless $self, $package;
+ weaken( $p->{"\0_s"} );
+
+ return $self unless @_;
+
+ my (%args) = @_;
+
+ for my $f (qw[ default allow deny rules process comment ]) {
+ next unless exists $args{$f};
+ if ( ref $args{$f} ) {
+ $self->$f( @{ $args{$f} } );
+ }
+ else {
+ $self->$f( $args{$f} );
+ }
+ }
+
+ return $self;
+}
+
+
+sub comment {
+ return $_[0]->{_comment}
+ if @_ == 1;
+ $_[0]->{_comment} = $_[1];
+ return;
+}
+
+
+sub process {
+ return $_[0]->{_process}
+ if @_ == 1;
+ $_[0]->{_process} = $_[1];
+ return;
+}
+
+
+sub script {
+ return $_[0]->{_script}
+ if @_ == 1;
+ $_[0]->{_script} = $_[1];
+ return;
+}
+
+
+sub style {
+ return $_[0]->{_style}
+ if @_ == 1;
+ $_[0]->{_style} = $_[1];
+ return;
+}
+
+
+sub allow {
+ my $self = shift;
+ for my $k (@_) {
+ $self->{_rules}{ lc $k } = 1;
+ }
+ $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse
+
+ return;
+}
+
+
+sub deny {
+ my $self = shift;
+
+ for my $k (@_) {
+ $self->{_rules}{ lc $k } = 0;
+ }
+
+ $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse
+
+ return;
+}
+
+
+sub rules {
+ my $self = shift;
+ my (%rules) = @_;
+ for my $k ( keys %rules ) {
+ $self->{_rules}{ lc $k } = $rules{$k};
+ }
+
+ $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse
+
+ return;
+}
+
+
+sub default {
+ return $_[0]->{_rules}{'*'}
+ if @_ == 1;
+
+ $_[0]->{_rules}{'*'} = $_[1] if defined $_[1];
+ $_[0]->{_rules}{'_'} = $_[2] if defined $_[2] and ref $_[2];
+ $_[0]->{_optimize} = 1; # each time a rule changes, reoptimize when parse
+
+ return;
+}
+
+
+sub scrub_file {
+ if ( @_ > 2 ) {
+ return unless defined $_[0]->_out( $_[2] );
+ }
+ else {
+ $_[0]->{_p}->handler( default => @_scrub );
+ }
+
+ $_[0]->_optimize(); #if $_[0]->{_optimize};
+
+ $_[0]->{_p}->parse_file( $_[1] );
+
+ return delete $_[0]->{_r} unless exists $_[0]->{_out};
+ print { $_[0]->{_out} } $_[0]->{_r} if length $_[0]->{_r};
+ delete $_[0]->{_out};
+ return 1;
+}
+
+
+sub scrub {
+ if ( @_ > 2 ) {
+ return unless defined $_[0]->_out( $_[2] );
+ }
+ else {
+ $_[0]->{_p}->handler( default => @_scrub );
+ }
+
+ $_[0]->_optimize(); # if $_[0]->{_optimize};
+
+ $_[0]->{_p}->parse( $_[1] ) if defined( $_[1] );
+ $_[0]->{_p}->eof();
+
+ return delete $_[0]->{_r} unless exists $_[0]->{_out};
+ delete $_[0]->{_out};
+ return 1;
+}
+
+
+sub _out {
+ my ( $self, $o ) = @_;
+
+ unless ( ref $o and ref \$o ne 'GLOB' ) {
+ open my $F, '>', $o or return;
+ binmode $F;
+ $self->{_out} = $F;
+ }
+ else {
+ $self->{_out} = $o;
+ }
+
+ $self->{_p}->handler( default => @_scrub_fh );
+
+ return 1;
+}
+
+
+sub _validate {
+ my ( $s, $t, $r, $a, $as ) = @_;
+ return "<$t>" unless %$a;
+
+ $r = $s->{_rules}->{$r};
+ my %f;
+
+ for my $k ( keys %$a ) {
+ my $check = exists $r->{$k} ? $r->{$k} : exists $r->{'*'} ? $r->{'*'} : next;
+
+ if ( ref $check eq 'CODE' ) {
+ my @v = $check->( $s, $t, $k, $a->{$k}, $a, \%f );
+ next unless @v;
+ $f{$k} = shift @v;
+ }
+ elsif ( ref $check || length($check) > 1 ) {
+ $f{$k} = $a->{$k} if $a->{$k} =~ m{$check};
+ }
+ elsif ($check) {
+ $f{$k} = $a->{$k};
+ }
+ }
+
+ if (%f) {
+ my %seen;
+ return "<$t $r>"
+ if $r = join ' ', map {
+ defined $f{$_}
+ ? qq[$_="] . encode_entities( $f{$_} ) . q["]
+ : $_; # boolean attribute (TODO?)
+ } grep { exists $f{$_} and !$seen{$_}++; } @$as;
+ }
+
+ return "<$t>";
+}
+
+
+sub _scrub_str {
+ my ( $p, $e, $t, $a, $as, $text ) = @_;
+
+ my $s = $p->{"\0_s"};
+ my $outstr = '';
+
+ if ( $e eq 'start' ) {
+ if ( exists $s->{_rules}->{$t} ) # is there a specific rule
+ {
+ if ( ref $s->{_rules}->{$t} ) # is it complicated?(not simple;)
+ {
+ $outstr .= $s->_validate( $t, $t, $a, $as );
+ }
+ elsif ( $s->{_rules}->{$t} ) # validate using default attribute rule
+ {
+ $outstr .= $s->_validate( $t, '_', $a, $as );
+ }
+ }
+ elsif ( $s->{_rules}->{'*'} ) # default allow tags
+ {
+ $outstr .= $s->_validate( $t, '_', $a, $as );
+ }
+ }
+ elsif ( $e eq 'end' ) {
+ my $place = 0;
+ if ( exists $s->{_rules}->{$t} ) {
+ $place = 1 if $s->{_rules}->{$t};
+ }
+ elsif ( $s->{_rules}->{'*'} ) {
+ $place = 1;
+ }
+ if ($place) {
+ if ( length $text ) {
+ $outstr .= "</$t>";
+ }
+ else {
+ substr $s->{_r}, -1, 0, ' /';
+ }
+ }
+ }
+ elsif ( $e eq 'comment' ) {
+ if ( $s->{_comment} ) {
+
+ # only copy comments through if they are well formed...
+ $outstr .= $text if ( $text =~ m|^<!--.*-->$|ms );
+ }
+ }
+ elsif ( $e eq 'process' ) {
+ $outstr .= $text if $s->{_process};
+ }
+ elsif ( $e eq 'text' or $e eq 'default' ) {
+ $text =~ s/</&lt;/g; #https://rt.cpan.org/Ticket/Attachment/8716/10332/scrubber.patch
+ $text =~ s/>/&gt;/g;
+
+ $outstr .= $text;
+ }
+ elsif ( $e eq 'start_document' ) {
+ $outstr = "";
+ }
+
+ return $outstr;
+}
+
+
+sub _scrub_fh {
+ my $self = $_[0]->{"\0_s"};
+ print { $self->{_out} } $self->{'_r'} if length $self->{_r};
+ $self->{'_r'} = _scrub_str(@_);
+}
+
+
+sub _scrub {
+
+ $_[0]->{"\0_s"}->{_r} .= _scrub_str(@_);
+}
+
+sub _optimize {
+ my ($self) = @_;
+
+ my (@ignore_elements) = grep { not $self->{"_$_"} } qw(script style);
+ $self->{_p}->ignore_elements(@ignore_elements); # if @ is empty, we reset ;)
+
+ return unless $self->{_optimize};
+
+ #sub allow
+ # return unless $self->{_optimize}; # till I figure it out (huh)
+
+ if ( $self->{_rules}{'*'} ) { # default allow
+ $self->{_p}->report_tags(); # so clear it
+ }
+ else {
+
+ my (@reports) =
+ grep { # report only tags we want
+ $self->{_rules}{$_}
+ } keys %{ $self->{_rules} };
+
+ $self->{_p}->report_tags( # default deny, so optimize
+ @reports
+ ) if @reports;
+ }
+
+ # sub deny
+ # return unless $self->{_optimize}; # till I figure it out (huh)
+ my (@ignores) =
+ grep { not $self->{_rules}{$_} } grep { $_ ne '*' } keys %{ $self->{_rules} };
+
+ $self->{_p}->ignore_tags( # always ignore stuff we don't want
+ @ignores
+ ) if @ignores;
+
+ $self->{_optimize} = 0;
+ return;
+}
+
+1;
+
+#print sprintf q[ '%-12s => %s,], "$_'", $h{$_} for sort keys %h;# perl!
+#perl -ne"chomp;print $_;print qq'\t\t# test ', ++$a if /ok\(/;print $/" test.pl >test2.pl
+#perl -ne"chomp;print $_;if( /ok\(/ ){s/\#test \d+$//;print qq'\t\t# test ', ++$a }print $/" test.pl >test2.pl
+#perl -ne"chomp;if(/ok\(/){s/# test .*$//;print$_,qq'\t\t# test ',++$a}else{print$_}print$/" test.pl >test2.pl
+
+__END__
+
+=pod
+
+=for stopwords html cpan callback homepage Perlbrew perltidy respository
+
+=head1 NAME
+
+HTML::Scrubber - Perl extension for scrubbing/sanitizing html
+
+=head1 VERSION
+
+version 0.15
+
+=head1 SYNOPSIS
+
+ use HTML::Scrubber;
+
+ my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] );
+ print $scrubber->scrub('<p><b>bold</b> <em>missing</em></p>');
+ # output is: <p><b>bold</b> </p>
+
+ # more complex input
+ my $html = q[
+ <style type="text/css"> BAD { background: #666; color: #666;} </style>
+ <script language="javascript"> alert("Hello, I am EVIL!"); </script>
+ <HR>
+ a => <a href=1>link </a>
+ br => <br>
+ b => <B> bold </B>
+ u => <U> UNDERLINE </U>
+ ];
+
+ print $scrubber->scrub($html);
+
+ $scrubber->deny( qw[ p b i u hr br ] );
+
+ print $scrubber->scrub($html);
+
+=head1 DESCRIPTION
+
+If you want to "scrub" or "sanitize" html input in a reliable and flexible
+fashion, then this module is for you.
+
+I wasn't satisfied with HTML::Sanitizer because it is based on
+HTML::TreeBuilder, so I thought I'd write something similar that works directly
+with HTML::Parser.
+
+=head1 METHODS
+
+First a note on documentation: just study the L<EXAMPLE|"EXAMPLE"> below. It's
+all the documentation you could need
+
+Also, be sure to read all the comments as well as L<How does it work?|"How does
+it work?">.
+
+If you're new to perl, good luck to you.
+
+=head2 comment
+
+ warn "comments are ", $p->comment ? 'allowed' : 'not allowed';
+ $p->comment(0); # off by default
+
+=head2 process
+
+ warn "process instructions are ", $p->process ? 'allowed' : 'not allowed';
+ $p->process(0); # off by default
+
+=head2 script
+
+ warn "script tags (and everything in between) are supressed"
+ if $p->script; # off by default
+ $p->script( 0 || 1 );
+
+B<**> Please note that this is implemented using HTML::Parser's ignore_elements
+function, so if C<script> is set to true, all script tags encountered will be
+validated like all other tags.
+
+=head2 style
+
+ warn "style tags (and everything in between) are supressed"
+ if $p->style; # off by default
+ $p->style( 0 || 1 );
+
+B<**> Please note that this is implemented using HTML::Parser's ignore_elements
+function, so if C<style> is set to true, all style tags encountered will be
+validated like all other tags.
+
+=head2 allow
+
+ $p->allow(qw[ t a g s ]);
+
+=head2 deny
+
+ $p->deny(qw[ t a g s ]);
+
+=head2 rules
+
+ $p->rules(
+ img => {
+ src => qr{^(?!http://)}i, # only relative image links allowed
+ alt => 1, # alt attribute allowed
+ '*' => 0, # deny all other attributes
+ },
+ a => {
+ href => sub { ... }, # check or adjust with a callback
+ },
+ b => 1,
+ ...
+ );
+
+Updates set of attribute rules. Each rule can be 1/0, regular expression or a
+callback. Values longer than 1 char are treated as regexps. Callback is called
+with the following arguments: this object, tag name, attribute name and
+attribute value, should return empty list to drop attribute, C<undef> to keep
+it without value or a new scalar value.
+
+=head2 default
+
+ print "default is ", $p->default();
+ $p->default(1); # allow tags by default
+ $p->default(
+ undef, # don't change
+ { # default attribute rules
+ '*' => 1, # allow attributes by default
+ }
+ );
+
+=head2 scrub_file
+
+ $html = $scrubber->scrub_file('foo.html'); ## returns giant string
+ die "Eeek $!" unless defined $html; ## opening foo.html may have failed
+ $scrubber->scrub_file('foo.html', 'new.html') or die "Eeek $!";
+ $scrubber->scrub_file('foo.html', *STDOUT)
+ or die "Eeek $!"
+ if fileno STDOUT;
+
+=head2 scrub
+
+ print $scrubber->scrub($html); ## returns giant string
+ $scrubber->scrub($html, 'new.html') or die "Eeek $!";
+ $scrubber->scrub($html', *STDOUT)
+ or die "Eeek $!"
+ if fileno STDOUT;
+
+=for comment _out
+ $scrubber->_out(*STDOUT) if fileno STDOUT;
+ $scrubber->_out('foo.html') or die "eeek $!";
+
+=for comment _validate
+Uses $self->{_rules} to do attribute validation.
+Takes tag, rule('_' || $tag), attrref.
+
+=for comment _scrub_str
+
+I<default> handler, used by both _scrub and _scrub_fh Moved all the common code
+(basically all of it) into a single routine for ease of maintenance
+
+=for comment _scrub_fh
+
+I<default> handler, does the scrubbing if we're scrubbing out to a file. Now
+calls _scrub_str and pushes that out to a file.
+
+=for comment _scrub
+
+I<default> handler, does the scrubbing if we're returning a giant string. Now
+calls _scrub_str and appends that to the output string.
+
+=head1 How does it work?
+
+When a tag is encountered, HTML::Scrubber allows/denies the tag using the
+explicit rule if one exists.
+
+If no explicit rule exists, Scrubber applies the default rule.
+
+If an explicit rule exists, but it's a simple rule(1), the default attribute
+rule is applied.
+
+=head2 EXAMPLE
+
+=for example begin
+
+ #!/usr/bin/perl -w
+ use HTML::Scrubber;
+ use strict;
+
+ my @allow = qw[ br hr b a ];
+
+ my @rules = (
+ script => 0,
+ img => {
+ src => qr{^(?!http://)}i, # only relative image links allowed
+ alt => 1, # alt attribute allowed
+ '*' => 0, # deny all other attributes
+ },
+ );
+
+ my @default = (
+ 0 => # default rule, deny all tags
+ {
+ '*' => 1, # default rule, allow all attributes
+ 'href' => qr{^(?:http|https|ftp)://}i,
+ 'src' => qr{^(?:http|https|ftp)://}i,
+
+ # If your perl doesn't have qr
+ # just use a string with length greater than 1
+ 'cite' => '(?i-xsm:^(?:http|https|ftp):)',
+ 'language' => 0,
+ 'name' => 1, # could be sneaky, but hey ;)
+ 'onblur' => 0,
+ 'onchange' => 0,
+ 'onclick' => 0,
+ 'ondblclick' => 0,
+ 'onerror' => 0,
+ 'onfocus' => 0,
+ 'onkeydown' => 0,
+ 'onkeypress' => 0,
+ 'onkeyup' => 0,
+ 'onload' => 0,
+ 'onmousedown' => 0,
+ 'onmousemove' => 0,
+ 'onmouseout' => 0,
+ 'onmouseover' => 0,
+ 'onmouseup' => 0,
+ 'onreset' => 0,
+ 'onselect' => 0,
+ 'onsubmit' => 0,
+ 'onunload' => 0,
+ 'src' => 0,
+ 'type' => 0,
+ }
+ );
+
+ my $scrubber = HTML::Scrubber->new();
+ $scrubber->allow(@allow);
+ $scrubber->rules(@rules); # key/value pairs
+ $scrubber->default(@default);
+ $scrubber->comment(1); # 1 allow, 0 deny
+
+ ## preferred way to create the same object
+ $scrubber = HTML::Scrubber->new(
+ allow => \@allow,
+ rules => \@rules,
+ default => \@default,
+ comment => 1,
+ process => 0,
+ );
+
+ require Data::Dumper, die Data::Dumper::Dumper($scrubber) if @ARGV;
+
+ my $it = q[
+ <?php echo(" EVIL EVIL EVIL "); ?> <!-- asdf -->
+ <hr>
+ <I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br>
+ <B> IN BOLD </B><br>
+ <A NAME="evil">
+ <A HREF="javascript:alert('die die die');">HREF=JAVA &lt;!&gt;</A>
+ <br>
+ <A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die');">
+ <IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCRIPT">
+ </A>
+ </A> <br>
+ ];
+
+ print "#original text", $/, $it, $/;
+ print
+ "#scrubbed text (default ", $scrubber->default(), # no arguments returns the current value
+ " comment ", $scrubber->comment(), " process ", $scrubber->process(), " )", $/, $scrubber->scrub($it), $/;
+
+ $scrubber->default(1); # allow all tags by default
+ $scrubber->comment(0); # deny comments
+
+ print
+ "#scrubbed text (default ",
+ $scrubber->default(),
+ " comment ",
+ $scrubber->comment(),
+ " process ",
+ $scrubber->process(),
+ " )", $/,
+ $scrubber->scrub($it),
+ $/;
+
+ $scrubber->process(1); # allow process instructions (dangerous)
+ $default[0] = 1; # allow all tags by default
+ $default[1]->{'*'} = 0; # deny all attributes by default
+ $scrubber->default(@default); # set the default again
+
+ print
+ "#scrubbed text (default ",
+ $scrubber->default(),
+ " comment ",
+ $scrubber->comment(),
+ " process ",
+ $scrubber->process(),
+ " )", $/,
+ $scrubber->scrub($it),
+ $/;
+
+=for example end
+
+=head2 FUN
+
+If you have Test::Inline (and you've installed HTML::Scrubber), try
+
+ pod2test Scrubber.pm >scrubber.t
+ perl scrubber.t
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<Test::Inline>.
+
+The C<HTML::Sanitizer> module is no longer available on CPAN.
+
+=head1 VERSION REQUIREMENTS
+
+As of version 0.14 I have added a perl minimum version requirement of 5.8. This
+is basically due to failures on the smokers perl 5.6 installations - which
+appears to be down to installation mechanisms and requirements.
+
+Since I don't want to spend the time supporting a version that is so old (and
+may not work for reasons on UTF support etc), I have added a C<use 5.008;> to
+the main module.
+
+If this is problematic I am very willing to accept patches to fix this up,
+although I do not personally see a good reason to support a release that has
+been obsolete for 13 years.
+
+=head1 CONTRIBUTING
+
+If you want to contribute to the development of this module, the code is on
+L<GitHub|http://github.com/nigelm/html-scrubber>. You'll need a perl
+environment with L<Dist::Zilla>, and if you're just getting started, there's
+some documentation on using Vagrant and Perlbrew
+L<here|http://mrcaron.github.io/2015/03/06/Perl-CPAN-Pull-Request.html>.
+
+There is now a C<.perltidyrc> and a <.tidyallrc> file within the respository
+for the standard perltidy settings used - I will apply these before new
+releases. Please do not let formatting prevent you from sending in patches etc
+- this can be sorted out as part of the release process. Info on C<tidyall>
+can be found at
+L<https://metacpan.org/pod/distribution/Code-TidyAll/bin/tidyall>.
+
+=head1 INSTALLATION
+
+See perlmodinstall for information and options on installing Perl modules.
+
+=head1 BUGS AND LIMITATIONS
+
+You can make new bug reports, and view existing ones, through the
+web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Scrubber>.
+
+=head1 AVAILABILITY
+
+The project homepage is L<https://metacpan.org/release/HTML-Scrubber>.
+
+The latest version of this module is available from the Comprehensive Perl
+Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
+site near you, or see L<https://metacpan.org/module/HTML::Scrubber/>.
+
+=head1 AUTHORS
+
+=over 4
+
+=item *
+
+Ruslan Zakirov <Ruslan.Zakirov@gmail.com>
+
+=item *
+
+Nigel Metheringham <nigelm@cpan.org>
+
+=item *
+
+D. H. <podmaster@cpan.org>
+
+=back
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is copyright (c) 2015 by Ruslan Zakirov, Nigel Metheringham, 2003-2004 D. H..
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+=cut