]> www.dxcluster.org Git - spider.git/blob - perl/BadWords.pm
Merge branch 'master' of /scm/spider
[spider.git] / perl / BadWords.pm
1 #
2 # Search for bad words in strings
3 #
4 # Copyright (c) 2000 Dirk Koopman
5 #
6 #
7 #
8
9 package BadWords;
10
11 use strict;
12
13 use DXUtil;
14 use DXVars;
15 use DXHash;
16 use DXDebug;
17
18 use IO::File;
19
20 use vars qw($badword $regexcode);
21
22 my $oldfn = "$main::data/badwords";
23 my $regex = "$main::data/badw_regex";
24 my $bwfn = "$main::data/badword";
25
26 # copy issue ones across
27 filecopy("$regex.gb.issue", $regex) unless -e $regex;
28 filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
29
30 $badword = new DXHash "badword";
31
32 # load the badwords file
33 sub load
34 {
35         my @out;
36         my $fh = new IO::File $oldfn;
37         
38         if ($fh) {
39                 while (<$fh>) {
40                         chomp;
41                         next if /^\s*\#/;
42                         my @list = split " ";
43                         for (@list) {
44                                 $badword->add($_);
45                         }
46                 }
47                 $fh->close;
48                 $badword->put;
49                 unlink $oldfn;
50         }
51         push @out, create_regex(); 
52         return @out;
53 }
54
55 sub create_regex
56 {
57         my @out;
58         my $fh = new IO::File $regex;
59         
60         if ($fh) {
61                 my $s = "sub { my \$str = shift; my \@out; \n";
62                 while (<$fh>) {
63                         chomp;
64                         next if /^\s*\#/;
65                         my @list = split " ";
66                         for (@list) {
67                                 # create a closure for each word so that it matches stuff with spaces/punctuation
68                                 # and repeated characters in it
69                                 my $w = uc $_;
70                                 my @l = split //, $w;
71                                 my $e = join '+[\s\W]*', @l;
72                                 $s .= "push \@out, \$1 if \$str =~ /\\b($e)/;\n";
73                         }
74                 }
75                 $s .= "return \@out;\n}";
76                 $regexcode = eval $s;
77                 dbg($s) if isdbg('badword');
78                 if ($@) {
79                         @out = ($@);
80                         dbg($@);
81                         return @out;
82                 }
83                 $fh->close;
84         } else {
85                 my $l = "can't open $regex $!";
86                 dbg($l);
87                 push @out, $l;
88         }
89         
90         return @out;
91 }
92
93 # check the text against the badwords list
94 sub check
95 {
96         my $s = uc shift;
97         my @out;
98
99         push @out, &$regexcode($s) if $regexcode;
100         
101         return @out if @out;
102         
103         for (split(/\b/, $s)) {
104                 push @out, $_ if $badword->in($_);
105         }
106
107         return @out;
108 }
109
110 1;