Date: Thu, 04 Oct 2001 09:32:21 GMT
From: "fishy" <joerivdv007@hotmail.com>
Subject: bigram statistical package by pedersen
Message-Id: <F0Wu7.84814$6x5.18846076@afrodite.telenet-ops.be>

Hi, if anyone is familiar with the bigram statistical package by Pedersen,
maybe they can help me.

My problem is the following:

the count.pl offers one the possibility to count bigrams which are not
adjacent and for the user to set the distance, which is allowed between the
words. Suppose on sets the distance at 5, the programme will count bigrams
for a certain word up to 4 words to the right. E.g. The dog eats bones. The
programm will count the following bigrams :
(the, dog)
(the, eats)
(the, bones) and
(the, .)

I want to change the code so that the programme adds a weight to more
distant bigrams. For the example above, the bigram (the, eats) is a
distance3 bigram and should e.g. get the weight 0.5. So instead of adding 1
to the total of the bigrams containing (the, eats), it only adds 0.5, and so
on. I have altered the code, but it doesn't work. Can anyone tell me what
I'm doing wrong?

Here is the code:

#!/usr/local/bin/perl -w

# count.pl version 0.4

# Program to take one or more text files and calculate the bigram frequency

# for the whole corpus.

#

# Copyright (C) 2000-2001

# Ted Pedersen, University of Minnesota, Duluth

# tpederse@d.umn.edu

# Satanjeev Banerjee, University of Minnesota, Duluth

# bane0025@d.umn.edu

#

# This program is free software; you can redistribute it and/or

# modify it under the terms of the GNU General Public License

# as published by the Free Software Foundation; either version 2

# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

#

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

#

#---------------------------------------------------------------------------
--

# Start of program

#---------------------------------------------------------------------------
--

# have to be able to use unicode... so turn on unicode support...!

# use utf8;

# utf8 turned off by tdp since regexes are not using utf8 \p{} due

# to the poor efficiency that results in regex mataching

# we have to use commandline options, so use the necessary package!

use Getopt::Long;

# first check if no commandline options have been provided... in which case

# print out the usage notes!

if ( $#ARGV == -1 )

{

&minimalUsageNotes();

exit;

}

# now get the options!

GetOptions("verbose", "recurse", "version", "help", "histogram=s",

"frequency=i", "window=i", "stop=s", "newLine", "extended",

"token=s"

);

# set the variables according to what has been provided!

if ( defined $opt_verbose ) { $verbose = $opt_verbose; }

else { $verbose = 0; }

if ( defined $opt_recurse ) { $recurse = $opt_recurse; }

else { $recurse = 0; }

if ( defined $opt_version ) { $version = $opt_version; }

else { $version = 0; }

if ( defined $opt_help ) { $help = $opt_help; }

else { $help = 0; }

if ( defined $opt_frequency ) { $cutOff = $opt_frequency; }

else { $cutOff = 0; }

if ( defined $opt_histogram )

{

$enableHistogram = 1;

$histFile = $opt_histogram;

}

else { $enableHistogram = 0; }

if ( defined $opt_window ) { $windowSize = $opt_window; }

else { $windowSize = 2; }

if ( defined $opt_stop )

{

$stopList = $opt_stop;

if (!(-e $stopList ))

{

print "Cant find stoplist file $stopList.\n";

askHelp();

exit;

}

}

# now starts the fun...!

# if help has been requested, print out help!

if ( $help )

{

&showHelp();

exit;

}

# if version has been requested, show version!

if ( $version )

{

&showVersion();

exit;

}

# check if tokens file has been supplied. if so, try to open it and extract
the

# regex's.

if ( defined $opt_token )

{

if ( !( -e $opt_token))

{

print "Cant find token definition file $opt_token.\n";

askHelp();

exit;

}


open TOKEN, $opt_token || die "Couldnt open $opt_token\n";


while(<TOKEN>)

{

chomp;

s/^\s*//g;

s/\s*$//g;

if (length($_) <= 0) { next; }

if (!(/^\//) || !(/\/$/))

{

print STDERR "Ignoring regex with no delimiters: $_\n";

next;

}

s/^\///;

s/\/$//;

push @tokenRegex, $_;

}

close TOKEN;

}

else

{

push @tokenRegex, "\\p{IsWord}+";

push @tokenRegex, "\\p{IsPunct}";

}

# create the complete token regex

$tokenizerRegex = "";

foreach $token (@tokenRegex)

{

if ( length($tokenizerRegex) > 0 )

{

$tokenizerRegex .= "|";

}

$tokenizerRegex .= "(";

$tokenizerRegex .= $token;

$tokenizerRegex .= ")";

}

# if you dont have any tokens to work with, abort

if ( $#tokenRegex < 0 )

{

print "No token definitions to work with.\n";

askHelp();

exit;

}

# get/set the maximum token length

# $maxTokenLen = (defined $opt_maxTokenLen)?($opt_maxTokenLen):(100);

# having stripped the commandline of all the options et al, we should now be

# left only with the source/destination files

# so, first get hold of the destination file!

$destination = shift;

# check to see if a destination has been supplied at all...

if ( !($destination ) )

{

print "No file supplied. ";

askHelp();

exit;

}

# check to see if destination exists, and if so, if we should overwrite...

if ( -e $destination )

{

print "File $destination exists! Overwrite (Y/N)? ";

$reply = <STDIN>;

chomp $reply;

$reply = uc $reply;

exit 0 if ($reply ne "Y");

}

# having ascertained that we may open the destination file for output, lets
do

# so...

open ( DST, ">$destination" ) || die "couldnt open $destination";

# whats left in the command line are paths. go thru them and salvage all

# text files to be processed. the following function does just that, putting

# all useful files in @sourceFiles :o)

&getSourceFiles(@ARGV);

# output the files found, if verbose set!

if ( $verbose )

{

print "\nFound the following $index file(s) to source from: \n";

for ( $i = 0; $i < $index; $i ++ ) { print "$sourceFiles[$i]\n"; }

print "\n";

}

# set bigramTotal, which is the variable where we'll get the sample size.

$bigramTotal = 0;

# now get the source files one by one from @sourceFiles, and process them in
a

# loop!

foreach $source (@sourceFiles)

{

# we already know that the file exists... that is checked by

# &getSourceFiles, so no need to check it again! so just open the source

# file!!


open( SRC, "$source" ) || die "Coudlnt open $source, quitting";


# having successfully opened the source file start reading it...

if ( $verbose ) { print "Accessing file $source.\n"; }


# start off the window index which will tell us where in the window array

# we are right now!!!

$windex = 0; # the NEXT place in the array to write to!


# read in the file, tokenize and process the tokens all in one fell swoop

# fell because of the (1) below :)


while (<>)

{

if ( defined $opt_newLine )

{

$opt_newLine = 1;

$windex = 0;

}

while ( /$tokenizerRegex/g )

{

$token = $&;

processToken($token);

}

}

}

# that is the tokenizing and token-processing done!

# now to put in the stop list, if its been provided

if ( defined $stopList )

{

# we have already checked that the stop list exists... so go ahead and open
it!

open ( STP, $stopList ) ||

die ("Couldnt open the stoplist file $stopList\n");


$stopString = "";

while ( <STP> )

{

chomp;

s/^\s+//;

s/\s+$//;

if ( /^\// && /\/$/ )

{

s/^\///;

s/\/$//;

if ( !($stopString eq "") )

{

$stopString .= "|";

}

$stopString .= "(" . $_ . ")";

}

}

close STP;

# having got the file, go thru the pair hash, getting rid of the

# offending bigrams

foreach ( keys %pairFreq )

{

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}


if ( ( $tempArray[0] =~ /^($stopString)$/ ) && ( $tempArray[1] =~
/^($stopString)$/ ) )

{

# reduce bigram total by the frequency of this bigram

$bigramTotal -= $pairFreq{$_};

# reduce left and right counts (and delete keys if

# they become zero) by the frequency of this bigram

$leftFreq{$tempArray[0]} -= $pairFreq{$_};

if ( $leftFreq{$tempArray[0]} <= 0 )

{

delete $leftFreq{$tempArray[0]};

}


$rightFreq{$tempArray[1]} -= $pairFreq{$_};

if ( $rightFreq{$tempArray[1]} <= 0 )

{

delete $rightFreq{$tempArray[1]};

}

# remove this bigram!

delete $pairFreq{$_};

}

}

}

# now to introduce frequency cut off's!

foreach ( keys %pairFreq )

{

if ( $pairFreq{$_} < $cutOff )

{

$bigramTotal -= $pairFreq{$_};

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}

$leftFreq{$tempArray[0]} -= $pairFreq{$_};

if ( $leftFreq{$tempArray[0]} <= 0 )

{

delete $leftFreq{$tempArray[0]};

}


$rightFreq{$tempArray[1]} -= $pairFreq{$_};

if ( $rightFreq{$tempArray[1]} <= 0 )

{

delete $rightFreq{$tempArray[1]};

}

# remove this bigram!

delete $pairFreq{$_};


}

}

# end of processing all the files. now to write out the information.

if ( $verbose ) { print "Writing to $destination.\n"; }

if ( defined $opt_extended )

{

$opt_extended *= 1;

# print out the window size used

print DST "\@count.WindowSize=$windowSize\n";

# print out the frequency cut off used

print DST "\@count.FrequencyCut=$cutOff\n";

}

# finally print out the total bigrams

print DST "$bigramTotal\n";

foreach ( sort { $pairFreq{$b} <=> $pairFreq{$a} } keys %pairFreq )

{

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}

# bit stuffing... if a line starts with a single @, its a command (extended

# output). if it starts with two consequtive @'s, then its a single
'literal' @.

$WeighedTotal = $pairfreq + $Dist3FreqWeight + $Dist4FreqWeight +
$Dist5FreqWeight;

if ( $_ =~ /^@/ ) { print DST "@"; }

print DST "$__"; # bigram

print DST "$pairFreq{$_} "; # bigram freq

print DST "$Dist3FreqWeight{$_} ";

print DST "$Dist4FreqWeight{$_} ";

print DST "$Dist5FreqWeight{$_} ";

print DST "$WeighedTotal{$_} ";


print DST "$leftFreq{$tempArray[0]} "; # single freq of first word

print DST "$rightFreq{$tempArray[1]}\n"; # single freq of second word

}

foreach ( sort { $pairDist3Freq{$b} <=> $pairDist3Freq{$a} } keys
%pairDist3Freq )

{

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}

# bit stuffing... if a line starts with a single @, its a command (extended

# output). if it starts with two consequtive @'s, then its a single
'literal' @.

$WeighedTotal = $pairfreq + $Dist3FreqWeight + $Dist4FreqWeight +
$Dist5FreqWeight;

if ( $_ =~ /^@/ ) { print DST "@"; }

print DST "$__"; # bigram

print DST "$pairFreq{$_} "; # bigram freq

print DST "$Dist3FreqWeight{$_} ";

print DST "$Dist4FreqWeight{$_} ";

print DST "$Dist5FreqWeight{$_} ";

print DST "$WeighedTotal{$_} ";


print DST "$leftFreq{$tempArray[0]} "; # single freq of first word

print DST "$rightFreq{$tempArray[1]}\n"; # single freq of second word

}

foreach ( sort { $pairDist4Freq{$b} <=> $pairDist4Freq{$a} } keys
%pairDist4Freq )

{

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}

# bit stuffing... if a line starts with a single @, its a command (extended

# output). if it starts with two consequtive @'s, then its a single
'literal' @.

$WeighedTotal = $pairfreq + $Dist3FreqWeight + $Dist4FreqWeight +
$Dist5FreqWeight;

if ( $_ =~ /^@/ ) { print DST "@"; }

print DST "$__"; # bigram

print DST "$pairFreq{$_} "; # bigram freq

print DST "$Dist3FreqWeight{$_} ";

print DST "$Dist4FreqWeight{$_} ";

print DST "$Dist5FreqWeight{$_} ";

print DST "$WeighedTotal{$_} ";


print DST "$leftFreq{$tempArray[0]} "; # single freq of first word

print DST "$rightFreq{$tempArray[1]}\n"; # single freq of second word

}

foreach ( sort { $pairDist5Freq{$b} <=> $pairDist5Freq{$a} } keys
%pairDist5Freq )

{

if ( $_ =~ /<>/ )

{

$tempArray[0] = $`;

$tempArray[1] = $';

}

else

{

print STDERR "Fatal internal error! Aborting!\n";

exit;

}



# bit stuffing... if a line starts with a single @, its a command (extended

# output). if it starts with two consequtive @'s, then its a single
'literal' @.

$WeighedTotal = $pairfreq + $Dist3FreqWeight + $Dist4FreqWeight +
$Dist5FreqWeight;

if ( $_ =~ /^@/ ) { print DST "@"; }

print DST "$__"; # bigram

print DST "$pairFreq{$_} "; # bigram freq

print DST "$Dist3FreqWeight{$_} ";

print DST "$Dist4FreqWeight{$_} ";

print DST "$Dist5FreqWeight{$_} ";

print DST "$WeighedTotal{$_} ";


print DST "$leftFreq{$tempArray[0]} "; # single freq of first word

print DST "$rightFreq{$tempArray[1]}\n"; # single freq of second word

}

# having done it all, close all open files...

close SRC;

close DST;

# now check to see if we need to output the histogram... if not leave!

if ( !($enableHistogram) ) { exit; }

# now see if the histogram file exists, and if so if we can overwrite it...

if ( -e $histFile )

{

print "File $histFile exists! Overwrite (Y/N)? ";

$reply = <STDIN>;

chomp $reply;

$reply = uc $reply;

exit 0 if ($reply ne "Y");

}

# having ascertained that we may open the histogram file for output, lets do

# so...

open ( HST, ">$histFile" ) || die "couldnt open $histFile";

# now to construct the histogram hash...

$maxFreq = 0;

foreach ( keys %pairFreq )

{

$histogram{$pairFreq{$_}}++;

if ( $pairFreq{$_} > $maxFreq ) { $maxFreq = $pairFreq{$_}; }

}

# having done that, lets print out to the hashFile...

print HST "Total bigrams = $bigramTotal\n";

for ( $i = 1; $i <= $maxFreq; $i++ )

{

if ( exists $histogram{$i} )

{

printf HST "Number of bigrams that occurred %3d time(s) = %5d (%.2f
percent)\n", $i, $histogram{$i}, ($histogram{$i}*$i*100)/$bigramTotal;

}

}

close HST;

# ... and thats it! :o)



#---------------------------------------------------------------------------
--

# User Defined Function Definitions

#---------------------------------------------------------------------------
--



# function to process tokens

sub processToken

{

my $token = shift;

# first put the word into the window array!

$window[$windex] = $token;


# if this is the first word of the corpus, just keep going!

if ( $windex == 0 )

{

$windex++;

return;

}


# otherwise, create the bigrams! the word that's just come in will

# go in $collocation[1], while we will successively put the rest

# of the array into $collocation[0] to create all the bigrams one

# by one!


$collocation[1] = $window[$windex];


for ( $i = 0; $i < $windex; $i++ )

{

$collocation[0] = $window[$i];


# create the bigram string in $pair...

$pair = $collocation[0] . "_" . $collocation[1];


# take into account how far apart the two words are

# when forming the bigram and multiply that count

# with a weighing factor

if ($windex == 2) {


# increment the total bigram count.

$bigramTotal++;

# increment the frequency count of this bigram

$pairFreq{$pair}++;

}

if ($windex == 3) {

$bigramTotal++;

# increment the frequency count of this bigram

$pairDist3Freq{$pair}++;

# multiply the frequency of this bigram with the weight

# for distance 3 bigrams

$Dist3FreqWeight = $pairDist3Freq * 0.5;

}

if ($windex == 4) {

$bigramTotal++;

# increment the frequency count of this bigram

$pairDist4Freq{$pair}++;

# multiply the frequency of this bigram with the weight

# for distance 4 bigrams

$Dist4FreqWeight = $pairDist4Freq * 0.25;

}

if ($windex == 5) {

$bigramTotal++;

# increment the frequency count of this bigram

$pairDist5Freq{$pair}++;

# multiply the frequency of this bigram with the weight

# for distance 5 bigrams

$Dist5FreqWeight = $pairDist5Freq * 0.125;

}


# increment the left count and right counts of the two words

# that make up the bigram

$leftFreq{$collocation[0]}++;

$rightFreq{$collocation[1]}++;

# $pairFreq{$pair}++;

}


# having dealt with all the bigrams, increment the windex, if less

# than the size, or shift out the first element of the array to

# make place for the next word thats coming in!


# if ( $windex < $windowSize - 1 ) { $windex++; }

# else { shift @window; }

}

# Function &getSourceFiles: function to take the command tail and

# return an array of text files to be used to count! while going thru the

# command line do the following processing:

#

# 1> if the string is a text file and can be opened, add it to the array.

# 2> if the string is a directory name, find all text files in that
directory,

# and append to array.

# 3> if the -r (recursive) option is set, go into all subdirectories of that

# directory too, to do the above!

sub getSourceFiles

{

# get the next commandline string...

my $nextString = shift;

$index = 0;


while ( $nextString )

{

if ( !( -e $nextString ) )

{

# file doesn't exist... ignore!


if ( $verbose ) { print "File $nextString does not exist!\n"; }

$nextString = shift;

next;

}


if ( !( -r $nextString ) )

{

# file can't be read... ignore!

if ( $verbose ) { print "File $nextString cant be read!\n"; }

$nextString = shift;

next;

}


if ( -d $nextString )

{

# this is a directory, go and search this directory for text files

&directorySearch( $nextString );

$nextString = shift;

next;

}


if ( !( -T $nextString ) )

{

# file is not a text file... ignore!

if ( $verbose ) { print "$nextString is not a text file!\n"; }

$nextString = shift;

next;

}


$sourceFiles[$index] = $nextString;

$index++;

$nextString = shift;

}

}

# function to (possibly recursively) search inside the given directory for

# text files

sub directorySearch

{

my $directory = $_[0];


opendir DIR, $directory || "Couldnt open directory $directory!\n";

my @files = grep !/^\./, readdir DIR;

@files = map "$directory/$_", @files;

closedir DIR;


my $file = "";


foreach $file (@files)

{

if ( ( -d $file ) && ( $recurse ) ) { &directorySearch($file); }

if ( ( -T $file ) )

{

$sourceFiles[$index] = $file;

$index++;

}

}

}



# function to output a minimal usage note when the user has not provided any

# commandline options

sub minimalUsageNotes

{

print "Usage: count.pl [OPTIONS] DESTINATION SOURCE [[, SOURCE] ...]\n";

askHelp();

}

# function to output help messages for this program

sub showHelp

{

print "Usage: count.pl [OPTIONS] DESTINATION SOURCE [[, SOURCE] ...]\n\n";

print "Counts the frequency of bigrams occurring in SOURCE and outputs
results to

DESTINATION. SOURCE may be a file or a directory. If a directory is
specified

as SOURCE, all text files in that directory (and all sub directories, if

--recurse is set) are counted. Optionally outputs a histogram to FILE (see

option --histogram below).\n\n";

print "OPTIONS:\n\n";

print " --verbose Switches to verbose mode.\n\n";

print " --version Prints the version number.\n\n";

print " --help Prints this help message.\n\n";

print " --recurse Recursively searches thru all directories mentioned

in the commandline for text files to count from.\n\n";

print " --frequency N Ignores all bigrams of frequency < N.\n\n";

print " --histogram FILE Outputs histogram to FILE. Tabulates how many times

bigrams of a given frequency have occured.\n\n";

print " --window N Sets window size to N. N = 2 by default.\n\n";

print " --stop FILE Removes all bigrams made up entirely of words from

FILE. Words in FILE should be separated by new line.\n\n";

print " --newLine Prevents bigrams from spanning across new-line\n";

print " characters.\n\n";

print " --token FILE Picks up regular expressions from FILE to use to\n";

print " tokenize the input text file. By default two\n";

print " regular expressions are provided.\n\n";

print " --extended Outputs chosen window size and frequency cut-off in

\"extended\" format. Warning: Extended output is not readable

by BSP version <= 0.3.\n\n";

}

# function to output the version number

sub showVersion

{

print "count.pl - Version 0.4\n";

print "A component of the BSP Version 0.4\n";

print "Copyright (C) 2000-2001, Ted Pedersen & Satanjeev Banerjee\n";

}

# function to output "ask for help" message when the user's goofed up!

sub askHelp

{

print "Type count.pl --help for help.\n";

}





