Last active
December 10, 2015 17:48
-
-
Save mediaczar/4470129 to your computer and use it in GitHub Desktop.
Count Bigrams (based on original by T. J. Otlogetswe)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# bigramcount - counts the number of bigrams in a text,
# prints them out in order of decreasing frequency.
# author: Thapelo J. Otlogetswe
# date: 2006-01-27
# http://thaps.blogspot.co.uk/2006/01/perl-bigram-count.html
# modified: Mat Morrison (@mediaczar)
# date: 2013-01-06
#
# usage:  bigramcount [FILE ...]    (reads STDIN when no file is named)
# output: one "<count>\t<word1> <word2>" line per distinct bigram
use strict;
use warnings;

my %count;       # "word1 word2" => number of times that bigram was seen
my $previous;    # most recent word read; undef until the first word arrives

while (my $line = <>) {
    chomp $line;
    $line =~ tr/A-Z/a-z/;            # fold ASCII upper case to lower case
    $line =~ tr/.,:;!&?"'(){}//d;    # delete punctuation characters

    # A free-standing dash separates two words. Replace it with a single
    # space (rather than nothing) so the neighbouring words are not fused
    # into one token.
    $line =~ s/ - / /g;

    # $previous is kept across input lines on purpose: a bigram may span a
    # line break.
    for my $word (split ' ', $line) {
        # The very first word has no predecessor, so it completes no bigram.
        $count{"$previous $word"}++ if defined $previous;
        $previous = $word;
    }
}

# Report every bigram, ordered by the comparator sub "numerically".
foreach my $bigram (sort numerically keys %count) {
    print "$count{$bigram}\t$bigram\n";
}
# Sort comparator for bigram keys: highest count in %count first. Bigrams
# with equal counts are ordered alphabetically so that the output is
# reproducible from run to run instead of following hash order.
sub numerically {
    return $count{$b} <=> $count{$a} || $a cmp $b;      # decreasing order
    # return $count{$a} <=> $count{$b} || $a cmp $b;    # increasing order
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# takes a whitespace-delimited bigram count list (FILE), one entry per
# line, formatted
#   <count> <word1> <word2>
# and removes lines with stopwords supplied in STOPWORDS (one per line)
# usage: <this script> FILE STOPWORDS
# author: Mat Morrison (@mediaczar)
# date: 2013-01-01
use strict;
use warnings;

die "usage: $0 FILE STOPWORDS\n" unless @ARGV >= 2;
my ($list_path, $stopwords_path) = @ARGV;

# Words listed here are never loaded as stopwords, even when STOPWORDS
# contains them. A line is still dropped when its SECOND word is one of them.
my %ignore = map { $_ => 1 } qw(rt not);

# Load the stopword set, skipping the ignored words.
open my $stopwords_fh, '<', $stopwords_path
    or die "Cannot open stopword file '$stopwords_path': $!\n";
my %stopwords;
while (my $word = <$stopwords_fh>) {
    chomp $word;
    $stopwords{$word} = 1 unless exists $ignore{$word};
}
close $stopwords_fh;

# Stream the list line by line instead of reading it all into memory.
open my $list_fh, '<', $list_path
    or die "Cannot open word list '$list_path': $!\n";
while (my $line = <$list_fh>) {
    chomp $line;
    $line =~ s/^\s+//;    # trim leading spaces

    # Capture the two words that follow the count field. The captures are
    # only consulted when the match succeeds; after a failed match $1/$2
    # would still hold the words of an earlier line.
    if ($line =~ /^\S*\s(\W?\w['\w-]*)\s(\W?\w['\w-]*)$/) {
        my ($first, $second) = ($1, $2);
        next if exists $stopwords{$first}
             or exists $stopwords{$second}
             or exists $ignore{$second};
    }

    # Lines that do not parse as "<count> <word1> <word2>" have no
    # identifiable stopword and are passed through unchanged.
    print "$line\n";
}
close $list_fh;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment