1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
#!/usr/bin/perl
#
# Inbound file classifier.
#
# Watches the inbound directory for "create" events. A newly created file
# whose name ends in .txt is treated as a journal and moved to the Journal
# directory. Any other newly created file triggers a classification pass:
# the category file is regenerated from the inbound directory listing, a
# document collection is built over that directory, and every document the
# collection yields is categorized by the restored Naive Bayes learner and
# moved into the PreProcess sub-directory named after its best category.
#
# Only one instance may run at a time; this is enforced with an exclusive,
# non-blocking flock on a lock file.

use Modern::Perl;
use strict;
use warnings;
no if $] >= 5.017011, warnings => 'experimental::smartmatch';

use Fcntl qw(:flock);
use AI::Categorizer;
use AI::Categorizer::Collection::Files;
use AI::Categorizer::Learner::NaiveBayes;
use Algorithm::NaiveBayes::Model::Frequency;
use File::Spec;
use File::Copy::Vigilant qw(copy move);
use File::ChangeNotify;
use File::Basename;

# Report that another instance already holds the lock and exit with status 1.
sub BailOut {
    print "$0 is already running. Exiting.\n";
    exit(1);
}

# Write one line per directory entry of $dir to $list_file, skipping entries
# whose name begins with a period. Each line is the entry name followed by a
# space and a newline. Dies if the list file cannot be opened or closed, or
# if the directory cannot be opened.
sub write_category_file {
    my ( $dir, $list_file ) = @_;

    open my $out_fh, '>', $list_file
        or die "error: open '$list_file': $!";
    opendir( my $dir_handle, $dir ) or die $!;

    # The explicit defined() keeps an entry literally named "0" from ending
    # the loop early.
    while ( defined( my $entry = readdir $dir_handle ) ) {
        next if $entry =~ m/^\./;
        print {$out_fh} "$entry \n";
    }

    closedir $dir_handle;

    # Buffered write errors only surface at close, so check it.
    close $out_fh or die "error: close '$list_file': $!";
    return;
}

# Return $candidate passed through a capture group so that the result is
# untainted; dies if the value does not match (for example, if it contains
# an embedded newline).
sub untaint_path {
    my ($candidate) = @_;

    if ( $candidate =~ /^(.*)$/ ) {
        return $1;    # value is now untainted
    }
    die "Bad data in $candidate";    # log this somewhere
}

# Singleton check. $fhpid stays open for the life of the process, which is
# what keeps the lock held. Note that every `use` line above runs at compile
# time, so the modules are already loaded by the time this check executes.
my $lockfile = 'E:\\FlowData\\PreProcess\\classifier.lock';

open( my $fhpid, '>>', $lockfile ) or die "error: open '$lockfile': $!";
flock( $fhpid, LOCK_EX | LOCK_NB ) or BailOut();

our $cat_file = 'E:\\FlowData\\PreProcess\\cats.txt';
our $path     = 'E:\\FlowData\\Inbound';

my $watcher = File::ChangeNotify->instantiate_watcher(
    directories => [$path],
    filter      => qr/\.(?:xml|txt)$/,
    exclude     => [ qr/(?:test|temp|audit|error)/is ],
);

# NOTE(review): %files is not referenced anywhere in this script; it is kept
# only in case something outside this file reads the package variable.
our %files;
our $nb = AI::Categorizer::Learner::NaiveBayes->restore_state('E:\\FlowData\\PreProcess\\state');

print "Begining to watch $path for changes.\n";

while ( my @events = $watcher->wait_for_events() ) {

    # NOTE(review): the pauses here and below presumably give the producer
    # time to finish writing the file -- confirm.
    sleep(5);

    foreach my $event (@events) {
        next if $event->type() ne "create";

        sleep(2);
        my ( $filename, $dirs ) = fileparse( $event->path() );
        print "Detected " . $event->type() . " on " . $filename . "\n";

        if ( $filename =~ m/\.txt$/ ) {
            print "Moving assumed journal $filename\n";
            move(
                $dirs . $filename,
                "E:\\FlowData\\Journal\\" . $filename,
                check   => 'md5',
                retries => 'infinite'
            );
        }
        else {
            write_category_file( $path, $cat_file );

            my $c = AI::Categorizer::Collection::Files->new(
                path          => $path,
                category_file => $cat_file
            );

            while ( my $document = $c->next ) {
                my $hypothesis = $nb->categorize($document);
                print "Classified ", $hypothesis->document_name();
                print " as ", $hypothesis->best_category, "\n";

                my $original = $path . "\\" . $hypothesis->document_name();
                my $classified =
                      'E:\\FlowData\\PreProcess\\'
                    . $hypothesis->best_category() . '\\'
                    . $hypothesis->document_name();

                $original   = untaint_path($original);
                $classified = untaint_path($classified);

                # NOTE(review): with retries => 'infinite' this call
                # presumably never returns if the move cannot succeed
                # (e.g. the source has already been moved) -- confirm
                # against File::Copy::Vigilant.
                move( $original, $classified, check => 'md5', retries => 'infinite' );
            }
        }
    }
}
Classification at its simplest.