mediaczar · December 10, 2015 17:48
diff --git a/countbigram.pl b/countbigram.pl
 #!/usr/bin/perl

 # bigramcount - counts the number of bigrams in a text,
 # prints them out in order of decreasing frequency.
 # author: Thapelo J. Otlogetswe
 # date: 2006-01-27
 # http://thaps.blogspot.co.uk/2006/01/perl-bigram-count.html
 # modified: Mat Morrison (@mediaczar)
 # date: 2013-01-06

 use strict;
 use warnings;

 # Bigram frequency table: "<left word> <right word>" => number of occurrences.
 # It stays a file-level lexical named %count because the `numerically`
 # sort comparator reads it.
 my %count;

 # Most recently seen word. It is undef until the first word has been read,
 # so no bigram with an empty left-hand word is ever recorded.
 my $previous_word;

 # Read every line from the files named on the command line (or STDIN).
 while (my $line = <>) {
 	chomp $line;
 	$line =~ tr/A-Z/a-z/;          # fold ASCII upper case to lower case
 	$line =~ tr/.,:;!&?"'(){}//d;  # delete punctuation characters
 	# Replace a free-standing dash with a single space; deleting it outright
 	# glued its neighbours together ("a - b" became "ab").
 	$line =~ s/ - / /g;
 	for my $word (split ' ', $line) {
 		# $previous_word is kept across input lines, so a bigram may
 		# span a line break.
 		$count{"$previous_word $word"}++ if defined $previous_word;
 		$previous_word = $word;
 	}
 }

 # Print "<count>\t<bigram>" lines in the order given by `numerically`.
 foreach my $bigram (sort numerically keys %count) {
 	print "$count{$bigram}\t$bigram\n";
 }
 # Sort comparator for bigram keys: highest count first.
 # Reads the file-level %count hash and the sort globals $a and $b.
 # Bigrams with equal counts fall back to alphabetical order, so the
 # listing no longer depends on hash iteration order.
 sub numerically {
 	return $count{$b} <=> $count{$a}   # decreasing order
 		|| $a cmp $b;                  # tie-break: alphabetical
 	# for increasing order use: $count{$a} <=> $count{$b}
 }
diff --git a/remove_stopwords_bigrams.pl b/remove_stopwords_bigrams.pl
 #!/usr/bin/perl

 # takes a whitespace-delimited bigram list (FILE) formatted
 # <count> <word1> <word2>
 # and removes lines containing stopwords supplied in STOPWORDS
 # author: Mat Morrison (@mediaczar)
 # date: 2013-01-01

 use strict;
 use warnings;
 # Usage: remove_stopwords_bigrams.pl FILE STOPWORDS
 #   FILE      - bigram list, one "<count> <word1> <word2>" entry per line
 #   STOPWORDS - one stopword per line
 # Prints every FILE line whose bigram contains no stopword.
 my ($list_path, $stopwords_path) = @ARGV;
 die "usage: $0 FILE STOPWORDS\n"
 	unless defined $list_path and defined $stopwords_path;

 # Words that are never loaded as stopwords, even when STOPWORDS lists them.
 my %ignore = map { $_ => 1 } qw(rt not);

 # Load the stopword set, skipping the ignored words.
 open(my $stopwords_fh, '<', $stopwords_path)
 	or die "cannot open $stopwords_path: $!\n";
 my %stopwords;
 while (my $word = <$stopwords_fh>) {
 	chomp $word;
 	$stopwords{$word} = 1 unless exists $ignore{$word};
 }
 close $stopwords_fh;

 # Filter the bigram list one line at a time.
 open(my $list_fh, '<', $list_path)
 	or die "cannot open $list_path: $!\n";
 while (my $line = <$list_fh>) {
 	chomp $line;
 	$line =~ s/^\s+//; # trim leading spaces
 	# <count> <word1> <word2>; each word may carry one leading
 	# non-word character.
 	if ($line =~ /^[^\s]*\s(\W?\w['\w-]*)\s(\W?\w['\w-]*)$/i) {
 		my ($first, $second) = ($1, $2);
 		# Drop the line when either word is a stopword, or when the
 		# second word is one of the ignored words.
 		next if exists $stopwords{$first}
 			or exists $stopwords{$second}
 			or exists $ignore{$second};
 	}
 	# Lines that do not parse as a bigram are passed through unchanged;
 	# the captures are only consulted after a successful match.
 	print $line . "\n";
 }
 close $list_fh;
	#!/usr/bin/perl

	# bigramcount - counts the number of bigrams in a text,
	# prints them out in order of decreasing frequency.
	# author: Thapelo J. Otlogetswe
	# date: 2006-01-27
	# http://thaps.blogspot.co.uk/2006/01/perl-bigram-count.html
	# modified: Mat Morrison (@mediaczar)
	# date: 2013-01-06

	use strict;
	use warnings;

	# Bigram frequency table: "<left word> <right word>" => number of occurrences.
	# It stays a file-level lexical named %count because the `numerically`
	# sort comparator reads it.
	my %count;

	# Most recently seen word. It is undef until the first word has been read,
	# so no bigram with an empty left-hand word is ever recorded.
	my $previous_word;

	# Read every line from the files named on the command line (or STDIN).
	while (my $line = <>) {
		chomp $line;
		$line =~ tr/A-Z/a-z/;          # fold ASCII upper case to lower case
		$line =~ tr/.,:;!&?"'(){}//d;  # delete punctuation characters
		# Replace a free-standing dash with a single space; deleting it outright
		# glued its neighbours together ("a - b" became "ab").
		$line =~ s/ - / /g;
		for my $word (split ' ', $line) {
			# $previous_word is kept across input lines, so a bigram may
			# span a line break.
			$count{"$previous_word $word"}++ if defined $previous_word;
			$previous_word = $word;
		}
	}

	# Print "<count>\t<bigram>" lines in the order given by `numerically`.
	foreach my $bigram (sort numerically keys %count) {
		print "$count{$bigram}\t$bigram\n";
	}
	# Sort comparator for bigram keys: highest count first.
	# Reads the file-level %count hash and the sort globals $a and $b.
	# Bigrams with equal counts fall back to alphabetical order, so the
	# listing no longer depends on hash iteration order.
	sub numerically {
		return $count{$b} <=> $count{$a}   # decreasing order
			|| $a cmp $b;                  # tie-break: alphabetical
		# for increasing order use: $count{$a} <=> $count{$b}
	}
	#!/usr/bin/perl

	# takes a whitespace-delimited bigram list (FILE) formatted
	# <count> <word1> <word2>
	# and removes lines containing stopwords supplied in STOPWORDS
	# author: Mat Morrison (@mediaczar)
	# date: 2013-01-01

	use strict;
	use warnings;
	# Usage: remove_stopwords_bigrams.pl FILE STOPWORDS
	#   FILE      - bigram list, one "<count> <word1> <word2>" entry per line
	#   STOPWORDS - one stopword per line
	# Prints every FILE line whose bigram contains no stopword.
	my ($list_path, $stopwords_path) = @ARGV;
	die "usage: $0 FILE STOPWORDS\n"
		unless defined $list_path and defined $stopwords_path;

	# Words that are never loaded as stopwords, even when STOPWORDS lists them.
	my %ignore = map { $_ => 1 } qw(rt not);

	# Load the stopword set, skipping the ignored words.
	open(my $stopwords_fh, '<', $stopwords_path)
		or die "cannot open $stopwords_path: $!\n";
	my %stopwords;
	while (my $word = <$stopwords_fh>) {
		chomp $word;
		$stopwords{$word} = 1 unless exists $ignore{$word};
	}
	close $stopwords_fh;

	# Filter the bigram list one line at a time.
	open(my $list_fh, '<', $list_path)
		or die "cannot open $list_path: $!\n";
	while (my $line = <$list_fh>) {
		chomp $line;
		$line =~ s/^\s+//; # trim leading spaces
		# <count> <word1> <word2>; each word may carry one leading
		# non-word character. The `*` quantifiers are required: without
		# them only a one-character count and a two-character first word
		# could match.
		if ($line =~ /^[^\s]*\s(\W?\w['\w-]*)\s(\W?\w['\w-]*)$/i) {
			my ($first, $second) = ($1, $2);
			# Drop the line when either word is a stopword, or when the
			# second word is one of the ignored words.
			next if exists $stopwords{$first}
				or exists $stopwords{$second}
				or exists $ignore{$second};
		}
		# Lines that do not parse as a bigram are passed through unchanged;
		# the captures are only consulted after a successful match.
		print $line . "\n";
	}
	close $list_fh;