package HTML::Scrubber; # ABSTRACT: Perl extension for scrubbing/sanitizing html use 5.008; # enforce minimum perl version of 5.8 use strict; use warnings; use HTML::Parser 3.47 (); use HTML::Entities; use Scalar::Util ('weaken'); our ( @_scrub, @_scrub_fh ); our $VERSION = '0.15'; # VERSION our $AUTHORITY = 'cpan:NIGELM'; # AUTHORITY # my my my my, these here to prevent foolishness like # http://perlmonks.org/index.pl?node_id=251127#Stealing+Lexicals (@_scrub) = ( \&_scrub, "self, event, tagname, attr, attrseq, text" ); (@_scrub_fh) = ( \&_scrub_fh, "self, event, tagname, attr, attrseq, text" ); sub new { my $package = shift; my $p = HTML::Parser->new( api_version => 3, default_h => \@_scrub, marked_sections => 0, strict_comment => 0, unbroken_text => 1, case_sensitive => 0, boolean_attribute_value => undef, empty_element_tags => 1, ); my $self = { _p => $p, _rules => { '*' => 0, }, _comment => 0, _process => 0, _r => "", _optimize => 1, _script => 0, _style => 0, }; $p->{"\0_s"} = bless $self, $package; weaken( $p->{"\0_s"} ); return $self unless @_; my (%args) = @_; for my $f (qw[ default allow deny rules process comment ]) { next unless exists $args{$f}; if ( ref $args{$f} ) { $self->$f( @{ $args{$f} } ); } else { $self->$f( $args{$f} ); } } return $self; } sub comment { return $_[0]->{_comment} if @_ == 1; $_[0]->{_comment} = $_[1]; return; } sub process { return $_[0]->{_process} if @_ == 1; $_[0]->{_process} = $_[1]; return; } sub script { return $_[0]->{_script} if @_ == 1; $_[0]->{_script} = $_[1]; return; } sub style { return $_[0]->{_style} if @_ == 1; $_[0]->{_style} = $_[1]; return; } sub allow { my $self = shift; for my $k (@_) { $self->{_rules}{ lc $k } = 1; } $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse return; } sub deny { my $self = shift; for my $k (@_) { $self->{_rules}{ lc $k } = 0; } $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse return; } sub rules { my $self = shift; my (%rules) = @_; for my $k ( keys %rules ) { $self->{_rules}{ lc $k } = $rules{$k}; } $self->{_optimize} = 1; # each time a rule changes, reoptimize when parse return; } sub default { return $_[0]->{_rules}{'*'} if @_ == 1; $_[0]->{_rules}{'*'} = $_[1] if defined $_[1]; $_[0]->{_rules}{'_'} = $_[2] if defined $_[2] and ref $_[2]; $_[0]->{_optimize} = 1; # each time a rule changes, reoptimize when parse return; } sub scrub_file { if ( @_ > 2 ) { return unless defined $_[0]->_out( $_[2] ); } else { $_[0]->{_p}->handler( default => @_scrub ); } $_[0]->_optimize(); #if $_[0]->{_optimize}; $_[0]->{_p}->parse_file( $_[1] ); return delete $_[0]->{_r} unless exists $_[0]->{_out}; print { $_[0]->{_out} } $_[0]->{_r} if length $_[0]->{_r}; delete $_[0]->{_out}; return 1; } sub scrub { if ( @_ > 2 ) { return unless defined $_[0]->_out( $_[2] ); } else { $_[0]->{_p}->handler( default => @_scrub ); } $_[0]->_optimize(); # if $_[0]->{_optimize}; $_[0]->{_p}->parse( $_[1] ) if defined( $_[1] ); $_[0]->{_p}->eof(); return delete $_[0]->{_r} unless exists $_[0]->{_out}; delete $_[0]->{_out}; return 1; } sub _out { my ( $self, $o ) = @_; unless ( ref $o and ref \$o ne 'GLOB' ) { open my $F, '>', $o or return; binmode $F; $self->{_out} = $F; } else { $self->{_out} = $o; } $self->{_p}->handler( default => @_scrub_fh ); return 1; } sub _validate { my ( $s, $t, $r, $a, $as ) = @_; return "<$t>" unless %$a; $r = $s->{_rules}->{$r}; my %f; for my $k ( keys %$a ) { my $check = exists $r->{$k} ? $r->{$k} : exists $r->{'*'} ? $r->{'*'} : next; if ( ref $check eq 'CODE' ) { my @v = $check->( $s, $t, $k, $a->{$k}, $a, \%f ); next unless @v; $f{$k} = shift @v; } elsif ( ref $check || length($check) > 1 ) { $f{$k} = $a->{$k} if $a->{$k} =~ m{$check}; } elsif ($check) { $f{$k} = $a->{$k}; } } if (%f) { my %seen; return "<$t $r>" if $r = join ' ', map { defined $f{$_} ? qq[$_="] . encode_entities( $f{$_} ) . q["] : $_; # boolean attribute (TODO?) } grep { exists $f{$_} and !$seen{$_}++; } @$as; } return "<$t>"; } sub _scrub_str { my ( $p, $e, $t, $a, $as, $text ) = @_; my $s = $p->{"\0_s"}; my $outstr = ''; if ( $e eq 'start' ) { if ( exists $s->{_rules}->{$t} ) # is there a specific rule { if ( ref $s->{_rules}->{$t} ) # is it complicated?(not simple;) { $outstr .= $s->_validate( $t, $t, $a, $as ); } elsif ( $s->{_rules}->{$t} ) # validate using default attribute rule { $outstr .= $s->_validate( $t, '_', $a, $as ); } } elsif ( $s->{_rules}->{'*'} ) # default allow tags { $outstr .= $s->_validate( $t, '_', $a, $as ); } } elsif ( $e eq 'end' ) { my $place = 0; if ( exists $s->{_rules}->{$t} ) { $place = 1 if $s->{_rules}->{$t}; } elsif ( $s->{_rules}->{'*'} ) { $place = 1; } if ($place) { if ( length $text ) { $outstr .= ""; } else { substr $s->{_r}, -1, 0, ' /'; } } } elsif ( $e eq 'comment' ) { if ( $s->{_comment} ) { # only copy comments through if they are well formed... $outstr .= $text if ( $text =~ m|^$|ms ); } } elsif ( $e eq 'process' ) { $outstr .= $text if $s->{_process}; } elsif ( $e eq 'text' or $e eq 'default' ) { $text =~ s//>/g; $outstr .= $text; } elsif ( $e eq 'start_document' ) { $outstr = ""; } return $outstr; } sub _scrub_fh { my $self = $_[0]->{"\0_s"}; print { $self->{_out} } $self->{'_r'} if length $self->{_r}; $self->{'_r'} = _scrub_str(@_); } sub _scrub { $_[0]->{"\0_s"}->{_r} .= _scrub_str(@_); } sub _optimize { my ($self) = @_; my (@ignore_elements) = grep { not $self->{"_$_"} } qw(script style); $self->{_p}->ignore_elements(@ignore_elements); # if @ is empty, we reset ;) return unless $self->{_optimize}; #sub allow # return unless $self->{_optimize}; # till I figure it out (huh) if ( $self->{_rules}{'*'} ) { # default allow $self->{_p}->report_tags(); # so clear it } else { my (@reports) = grep { # report only tags we want $self->{_rules}{$_} } keys %{ $self->{_rules} }; $self->{_p}->report_tags( # default deny, so optimize @reports ) if @reports; } # sub deny # return unless $self->{_optimize}; # till I figure it out (huh) my (@ignores) = grep { not $self->{_rules}{$_} } grep { $_ ne '*' } keys %{ $self->{_rules} }; $self->{_p}->ignore_tags( # always ignore stuff we don't want @ignores ) if @ignores; $self->{_optimize} = 0; return; } 1; #print sprintf q[ '%-12s => %s,], "$_'", $h{$_} for sort keys %h;# perl! #perl -ne"chomp;print $_;print qq'\t\t# test ', ++$a if /ok\(/;print $/" test.pl >test2.pl #perl -ne"chomp;print $_;if( /ok\(/ ){s/\#test \d+$//;print qq'\t\t# test ', ++$a }print $/" test.pl >test2.pl #perl -ne"chomp;if(/ok\(/){s/# test .*$//;print$_,qq'\t\t# test ',++$a}else{print$_}print$/" test.pl >test2.pl __END__ =pod =for stopwords html cpan callback homepage Perlbrew perltidy respository =head1 NAME HTML::Scrubber - Perl extension for scrubbing/sanitizing html =head1 VERSION version 0.15 =head1 SYNOPSIS use HTML::Scrubber; my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); print $scrubber->scrub('

bold missing

'); # output is:

bold

# more complex input my $html = q[
a => link br =>
b => bold u => UNDERLINE ]; print $scrubber->scrub($html); $scrubber->deny( qw[ p b i u hr br ] ); print $scrubber->scrub($html); =head1 DESCRIPTION If you want to "scrub" or "sanitize" html input in a reliable and flexible fashion, then this module is for you. I wasn't satisfied with HTML::Sanitizer because it is based on HTML::TreeBuilder, so I thought I'd write something similar that works directly with HTML::Parser. =head1 METHODS First a note on documentation: just study the L below. It's all the documentation you could need Also, be sure to read all the comments as well as L. If you're new to perl, good luck to you. =head2 comment warn "comments are ", $p->comment ? 'allowed' : 'not allowed'; $p->comment(0); # off by default =head2 process warn "process instructions are ", $p->process ? 'allowed' : 'not allowed'; $p->process(0); # off by default =head2 script warn "script tags (and everything in between) are supressed" if $p->script; # off by default $p->script( 0 || 1 ); B<**> Please note that this is implemented using HTML::Parser's ignore_elements function, so if C