Last active
March 30, 2016 17:59
-
-
Save pichi/2d10c93242d5057913d026a607f07dd4 to your computer and use it in GitHub Desktop.
Stopwords Benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $ erl -pa eministat/ebin | |
| Erlang/OTP 18 [erts-7.3] [source] [64-bit] [smp:4:4] [async-threads:10] [hipe] [kernel-poll:false] | |
| Eshell V7.3 (abort with ^G) | |
| 1> {ok, Bin} = file:read_file("/home/hynek/Downloads/words.txt"), L = string:tokens(binary_to_list(Bin), "\s\r\n"), length(L). | |
| 113809 | |
| 2> length(lists:filter(fun stopwords_clause:is_stopword/1, L)). | |
| 122 | |
| 3> length(lists:filter(fun stopwords_map:is_stopword/1, L)). | |
| 122 | |
| 4> Clause = eministat:s("clause", fun() -> lists:filter(fun stopwords_clause:is_stopword/1, L) end, 50). | |
| {dataset,"clause", | |
| [3490,3493,3498,3501,3504,3507,3513,3539,3541,3544,3548, | |
| 3549,3551,3552,3554,3557,3559,3560,3562,3564,3570,3571,3581, | |
| 3589,3591,3611|...], | |
| 181508.0,6.595332e8,50} | |
| 5> Map = eministat:s("map", fun() -> lists:filter(fun stopwords_map:is_stopword/1, L) end, 50). | |
| {dataset,"map", | |
| [10950,10965,10971,10978,10982,10983,10988,10993,10998, | |
| 11002,11012,11013,11016,11017,11017,11019,11021,11025,11026, | |
| 11026,11028,11030,11035,11038,11040,11045|...], | |
| 555276.0,6170067514.0,50} | |
| 6> eministat:x(95.0, Clause, Map). | |
| x clause | |
| + map | |
| +--------------------------------------------------------------------------+ | |
| |xxxxx +++++ +| | |
| |xxxx ++++ | | |
| |xxxx +++ | | |
| |xxxx ++ | | |
| |xxx ++ | | |
| |xxx ++ | | |
| |xx ++ | | |
| |xx ++ | | |
| |xx ++ | | |
| |xx + | | |
| |xx + | | |
| |xx + | | |
| |xx + | | |
| |xx + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | x + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| | + | | |
| ||A| | | |
| | |_MA_| | | |
| +--------------------------------------------------------------------------+ | |
| Dataset: x N=50 CI=95.0000 | |
| Statistic Value [ Bias] (Bootstrapped LB‥UB) | |
| Min: 3490.00 | |
| 1st Qu. 3551.00 | |
| Median: 3591.00 | |
| 3rd Qu. 3679.00 | |
| Max: 3945.00 | |
| Average: 3630.16 [ 0.137534] ( 3602.82 ‥ 3664.56) | |
| Std. Dev: 113.400 [ -1.81311] ( 90.8425 ‥ 141.539) | |
| Outliers: 0/4 = 4 (μ=3630.30, σ=111.587) | |
| Outlier variance: 0.151802 (moderate) | |
| ------ | |
| Dataset: + N=50 CI=95.0000 | |
| Statistic Value [ Bias] (Bootstrapped LB‥UB) | |
| Min: 1.09500e+4 | |
| 1st Qu. 1.10160e+4 | |
| Median: 1.10400e+4 | |
| 3rd Qu. 1.11270e+4 | |
| Max: 1.28270e+4 | |
| Average: 1.11055e+4 [ 0.297998] ( 1.10611e+4 ‥ 1.12491e+4) | |
| Std. Dev: 264.914 [ -31.0673] ( 84.7956 ‥ 582.629) | |
| Outliers: 0/2 = 2 (μ=1.11058e+4, σ=233.847) | |
| Outlier variance: 9.45082e-2 (slight) | |
| Difference at 95.0% confidence | |
| 7475.36 ± 80.8533 | |
| 205.924% ± 2.22726% | |
| (Student's t, pooled s = 203.763) | |
| ------ | |
| ok |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% stopwords_clause: stop-word membership test written as one function
%% clause per word.
%%
%% is_stopword(Word) returns `true` when Word is exactly equal to one of the
%% string literals in the clause heads below and `false` for every other
%% term (final catch-all clause). Matching is exact: the literals are
%% lower-case Erlang strings (lists of character codes), so differently
%% cased words or binaries fall through to `false`.
%%
%% This is the "clause" variant exercised through lists:filter/2 in the
%% benchmark shell transcript of this gist.
%% NOTE(review): the word list appears to mirror the keys used in
%% stopwords_map -- keep the two in sync if either one changes.
| -module(stopwords_clause). | |
| -export([is_stopword/1]). | |
| is_stopword("a") -> true; | |
| is_stopword("about") -> true; | |
| is_stopword("above") -> true; | |
| is_stopword("after") -> true; | |
| is_stopword("again") -> true; | |
| is_stopword("against") -> true; | |
| is_stopword("all") -> true; | |
| is_stopword("am") -> true; | |
| is_stopword("an") -> true; | |
| is_stopword("and") -> true; | |
| is_stopword("any") -> true; | |
| is_stopword("are") -> true; | |
| is_stopword("aren't") -> true; | |
| is_stopword("as") -> true; | |
| is_stopword("at") -> true; | |
| is_stopword("be") -> true; | |
| is_stopword("because") -> true; | |
| is_stopword("been") -> true; | |
| is_stopword("before") -> true; | |
| is_stopword("being") -> true; | |
| is_stopword("below") -> true; | |
| is_stopword("between") -> true; | |
| is_stopword("both") -> true; | |
| is_stopword("but") -> true; | |
| is_stopword("by") -> true; | |
| is_stopword("can't") -> true; | |
| is_stopword("cannot") -> true; | |
| is_stopword("could") -> true; | |
| is_stopword("couldn't") -> true; | |
| is_stopword("did") -> true; | |
| is_stopword("didn't") -> true; | |
| is_stopword("do") -> true; | |
| is_stopword("does") -> true; | |
| is_stopword("doesn't") -> true; | |
| is_stopword("doing") -> true; | |
| is_stopword("don't") -> true; | |
| is_stopword("down") -> true; | |
| is_stopword("during") -> true; | |
| is_stopword("each") -> true; | |
| is_stopword("few") -> true; | |
| is_stopword("for") -> true; | |
| is_stopword("from") -> true; | |
| is_stopword("further") -> true; | |
| is_stopword("had") -> true; | |
| is_stopword("hadn't") -> true; | |
| is_stopword("has") -> true; | |
| is_stopword("hasn't") -> true; | |
| is_stopword("have") -> true; | |
| is_stopword("haven't") -> true; | |
| is_stopword("having") -> true; | |
| is_stopword("he") -> true; | |
| is_stopword("he'd") -> true; | |
| is_stopword("he'll") -> true; | |
| is_stopword("he's") -> true; | |
| is_stopword("her") -> true; | |
| is_stopword("here") -> true; | |
| is_stopword("here's") -> true; | |
| is_stopword("hers") -> true; | |
| is_stopword("herself") -> true; | |
| is_stopword("him") -> true; | |
| is_stopword("himself") -> true; | |
| is_stopword("his") -> true; | |
| is_stopword("how") -> true; | |
| is_stopword("how's") -> true; | |
| is_stopword("i") -> true; | |
| is_stopword("i'd") -> true; | |
| is_stopword("i'll") -> true; | |
| is_stopword("i'm") -> true; | |
| is_stopword("i've") -> true; | |
| is_stopword("if") -> true; | |
| is_stopword("in") -> true; | |
| is_stopword("into") -> true; | |
| is_stopword("is") -> true; | |
| is_stopword("isn't") -> true; | |
| is_stopword("it") -> true; | |
| is_stopword("it's") -> true; | |
| is_stopword("its") -> true; | |
| is_stopword("itself") -> true; | |
| is_stopword("let's") -> true; | |
| is_stopword("me") -> true; | |
| is_stopword("more") -> true; | |
| is_stopword("most") -> true; | |
| is_stopword("mustn't") -> true; | |
| is_stopword("my") -> true; | |
| is_stopword("myself") -> true; | |
| is_stopword("no") -> true; | |
| is_stopword("nor") -> true; | |
| is_stopword("not") -> true; | |
| is_stopword("of") -> true; | |
| is_stopword("off") -> true; | |
| is_stopword("on") -> true; | |
| is_stopword("once") -> true; | |
| is_stopword("only") -> true; | |
| is_stopword("or") -> true; | |
| is_stopword("other") -> true; | |
| is_stopword("ought") -> true; | |
| is_stopword("our") -> true; | |
| is_stopword("ours") -> true; | |
| is_stopword("ourselves") -> true; | |
| is_stopword("out") -> true; | |
| is_stopword("over") -> true; | |
| is_stopword("own") -> true; | |
| is_stopword("same") -> true; | |
| is_stopword("shan't") -> true; | |
| is_stopword("she") -> true; | |
| is_stopword("she'd") -> true; | |
| is_stopword("she'll") -> true; | |
| is_stopword("she's") -> true; | |
| is_stopword("should") -> true; | |
| is_stopword("shouldn't") -> true; | |
| is_stopword("so") -> true; | |
| is_stopword("some") -> true; | |
| is_stopword("such") -> true; | |
| is_stopword("than") -> true; | |
| is_stopword("that") -> true; | |
| is_stopword("that's") -> true; | |
| is_stopword("the") -> true; | |
| is_stopword("their") -> true; | |
| is_stopword("theirs") -> true; | |
| is_stopword("them") -> true; | |
| is_stopword("themselves") -> true; | |
| is_stopword("then") -> true; | |
| is_stopword("there") -> true; | |
| is_stopword("there's") -> true; | |
| is_stopword("these") -> true; | |
| is_stopword("they") -> true; | |
| is_stopword("they'd") -> true; | |
| is_stopword("they'll") -> true; | |
| is_stopword("they're") -> true; | |
| is_stopword("they've") -> true; | |
| is_stopword("this") -> true; | |
| is_stopword("those") -> true; | |
| is_stopword("through") -> true; | |
| is_stopword("to") -> true; | |
| is_stopword("too") -> true; | |
| is_stopword("under") -> true; | |
| is_stopword("until") -> true; | |
| is_stopword("up") -> true; | |
| is_stopword("very") -> true; | |
| is_stopword("was") -> true; | |
| is_stopword("wasn't") -> true; | |
| is_stopword("we") -> true; | |
| is_stopword("we'd") -> true; | |
| is_stopword("we'll") -> true; | |
| is_stopword("we're") -> true; | |
| is_stopword("we've") -> true; | |
| is_stopword("were") -> true; | |
| is_stopword("weren't") -> true; | |
| is_stopword("what") -> true; | |
| is_stopword("what's") -> true; | |
| is_stopword("when") -> true; | |
| is_stopword("when's") -> true; | |
| is_stopword("where") -> true; | |
| is_stopword("where's") -> true; | |
| is_stopword("which") -> true; | |
| is_stopword("while") -> true; | |
| is_stopword("who") -> true; | |
| is_stopword("who's") -> true; | |
| is_stopword("whom") -> true; | |
| is_stopword("why") -> true; | |
| is_stopword("why's") -> true; | |
| is_stopword("with") -> true; | |
| is_stopword("won't") -> true; | |
| is_stopword("would") -> true; | |
| is_stopword("wouldn't") -> true; | |
| is_stopword("you") -> true; | |
| is_stopword("you'd") -> true; | |
| is_stopword("you'll") -> true; | |
| is_stopword("you're") -> true; | |
| is_stopword("you've") -> true; | |
| is_stopword("your") -> true; | |
| is_stopword("yours") -> true; | |
| is_stopword("yourself") -> true; | |
| is_stopword("yourselves") -> true; | |
%% Catch-all: any term that matched none of the literals above is not a
%% stop word. It must stay the last clause, since clauses are tried in order.
| is_stopword(_) -> false. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% stopwords_map: stop-word membership test written as a lookup in a map
%% literal.
%%
%% is_stopword(S) calls maps:get/3 with S as the key, the map literal below
%% as the map, and `false` as the default. It therefore returns `true` when
%% S is exactly equal to one of the keys (every key maps to `true`) and
%% `false` for any other term. Keys are lower-case Erlang strings (lists of
%% character codes), so differently cased words or binaries yield `false`.
%%
%% This is the "map" variant exercised through lists:filter/2 in the
%% benchmark shell transcript of this gist.
%% NOTE(review): the key set appears to mirror the clause heads in
%% stopwords_clause -- keep the two in sync if either one changes.
| -module(stopwords_map). | |
| -export([is_stopword/1]). | |
| is_stopword(S) -> | |
| maps:get( | |
%% Key to look up.
| S, | |
%% Map literal written inline in the call; all values are `true`.
| #{ | |
| "a" => true, | |
| "about" => true, | |
| "above" => true, | |
| "after" => true, | |
| "again" => true, | |
| "against" => true, | |
| "all" => true, | |
| "am" => true, | |
| "an" => true, | |
| "and" => true, | |
| "any" => true, | |
| "are" => true, | |
| "aren't" => true, | |
| "as" => true, | |
| "at" => true, | |
| "be" => true, | |
| "because" => true, | |
| "been" => true, | |
| "before" => true, | |
| "being" => true, | |
| "below" => true, | |
| "between" => true, | |
| "both" => true, | |
| "but" => true, | |
| "by" => true, | |
| "can't" => true, | |
| "cannot" => true, | |
| "could" => true, | |
| "couldn't" => true, | |
| "did" => true, | |
| "didn't" => true, | |
| "do" => true, | |
| "does" => true, | |
| "doesn't" => true, | |
| "doing" => true, | |
| "don't" => true, | |
| "down" => true, | |
| "during" => true, | |
| "each" => true, | |
| "few" => true, | |
| "for" => true, | |
| "from" => true, | |
| "further" => true, | |
| "had" => true, | |
| "hadn't" => true, | |
| "has" => true, | |
| "hasn't" => true, | |
| "have" => true, | |
| "haven't" => true, | |
| "having" => true, | |
| "he" => true, | |
| "he'd" => true, | |
| "he'll" => true, | |
| "he's" => true, | |
| "her" => true, | |
| "here" => true, | |
| "here's" => true, | |
| "hers" => true, | |
| "herself" => true, | |
| "him" => true, | |
| "himself" => true, | |
| "his" => true, | |
| "how" => true, | |
| "how's" => true, | |
| "i" => true, | |
| "i'd" => true, | |
| "i'll" => true, | |
| "i'm" => true, | |
| "i've" => true, | |
| "if" => true, | |
| "in" => true, | |
| "into" => true, | |
| "is" => true, | |
| "isn't" => true, | |
| "it" => true, | |
| "it's" => true, | |
| "its" => true, | |
| "itself" => true, | |
| "let's" => true, | |
| "me" => true, | |
| "more" => true, | |
| "most" => true, | |
| "mustn't" => true, | |
| "my" => true, | |
| "myself" => true, | |
| "no" => true, | |
| "nor" => true, | |
| "not" => true, | |
| "of" => true, | |
| "off" => true, | |
| "on" => true, | |
| "once" => true, | |
| "only" => true, | |
| "or" => true, | |
| "other" => true, | |
| "ought" => true, | |
| "our" => true, | |
| "ours" => true, | |
| "ourselves" => true, | |
| "out" => true, | |
| "over" => true, | |
| "own" => true, | |
| "same" => true, | |
| "shan't" => true, | |
| "she" => true, | |
| "she'd" => true, | |
| "she'll" => true, | |
| "she's" => true, | |
| "should" => true, | |
| "shouldn't" => true, | |
| "so" => true, | |
| "some" => true, | |
| "such" => true, | |
| "than" => true, | |
| "that" => true, | |
| "that's" => true, | |
| "the" => true, | |
| "their" => true, | |
| "theirs" => true, | |
| "them" => true, | |
| "themselves" => true, | |
| "then" => true, | |
| "there" => true, | |
| "there's" => true, | |
| "these" => true, | |
| "they" => true, | |
| "they'd" => true, | |
| "they'll" => true, | |
| "they're" => true, | |
| "they've" => true, | |
| "this" => true, | |
| "those" => true, | |
| "through" => true, | |
| "to" => true, | |
| "too" => true, | |
| "under" => true, | |
| "until" => true, | |
| "up" => true, | |
| "very" => true, | |
| "was" => true, | |
| "wasn't" => true, | |
| "we" => true, | |
| "we'd" => true, | |
| "we'll" => true, | |
| "we're" => true, | |
| "we've" => true, | |
| "were" => true, | |
| "weren't" => true, | |
| "what" => true, | |
| "what's" => true, | |
| "when" => true, | |
| "when's" => true, | |
| "where" => true, | |
| "where's" => true, | |
| "which" => true, | |
| "while" => true, | |
| "who" => true, | |
| "who's" => true, | |
| "whom" => true, | |
| "why" => true, | |
| "why's" => true, | |
| "with" => true, | |
| "won't" => true, | |
| "would" => true, | |
| "wouldn't" => true, | |
| "you" => true, | |
| "you'd" => true, | |
| "you'll" => true, | |
| "you're" => true, | |
| "you've" => true, | |
| "your" => true, | |
| "yours" => true, | |
| "yourself" => true, | |
| "yourselves" => true | |
| }, | |
%% Default returned by maps:get/3 when S is not a key of the map.
| false). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment