At my company, we are building infrastructure that enables us to perform computations involving large bodies of text data.
To get familiar with the tech involved, I started with a simple experiment: using the Common Crawl metadata corpus, count crawled URLs grouped by top-level domain (TLD).
Here is a sample of what output from this "TLD query" might look like:
com 19298
org 2595
net 2228
uk 1674
de 1227
...
It's not a very exciting query. To be blunt, it's a pretty boring one. But that's the point: this is new ground for me, so I'm starting simple and capturing what I learn.
This gist is a fully-fledged Git repo with all the code necessary to run this query, so if you want to play with the code yourself, go ahead and clone this thing. You'll also need to install Leiningen.
OS X Homebrew users, here's a shortcut for you: brew install --devel leiningen
I've downloaded the job output files (part-00000 ... part-00261)
and sorted them by TLD counts:
cat part* | sort -k2nr > results.txt
Here are the top 10 results:
com 2673598481
org 304265520
net 267384802
de 205656928
uk 168846878
pl 82716496
ru 76587934
info 73012083
nl 63865755
fr 57607765
All results are here.
I ran this query on an Amazon EMR cluster of 50 cc1.4xlarge
machines, which took about 80 minutes to execute against the entire corpus of
Common Crawl metadata. Overall, the query saw 4,781,766,325
crawled URLs that had domain names with valid TLDs.
I was also able to confirm that a simple linear model can predict how long a job will run, based on the results of the same job run against a smaller data corpus with fewer data processing nodes: I ran the query against 1 Common Crawl segment, then against 10 segments, and finally against all 56 segments, as sketched below.
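Here's a minimal sketch of that model in Clojure. The model and the sample numbers are hypothetical (not measurements from my runs); the assumption is simply that runtime grows linearly with segment count and shrinks linearly with node count:
(defn predict-minutes
  "Extrapolates runtime from a baseline run that took base-minutes
  to process base-segments segments on base-nodes nodes."
  [base-minutes base-segments base-nodes segments nodes]
  (double (* base-minutes
             (/ segments base-segments)
             (/ base-nodes nodes))))
;; e.g. if a run over 10 segments on 10 nodes took 30 minutes,
;; a run over all 56 segments on 50 nodes should take roughly:
;; (predict-minutes 30 10 10 56 50) => 33.6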
Here are the monitoring graphs for the time period when the cluster
was active:
Here's the command to launch an EMR cluster similar to the one described here (using the Amazon EMR Command Line Interface):
./elastic-mapreduce --create --name "CommonCrawl (TLD: all segments)" \
--jar s3://your.bucket/ccooo-standalone.jar \
--args 'ccooo.commoncrawl.TldQueryExe,s3://aws-publicdatasets/common-crawl/parse-output/segment,s3://your.bucket/tlds/all' \
--instance-group master --instance-type m2.2xlarge --instance-count 1 --bid-price 0.111 \
--instance-group core --instance-type cc1.4xlarge --instance-count 50 --bid-price 0.301 \
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configurations/latest/memory-intensive \
--availability-zone us-east-1b
NOTE: the master node needs to be configured with the
memory-intensive bootstrap script because various JVM processes need
a lot of heap space to deal with 245,000+ Hadoop tasks.
I picked the us-east-1b zone because spot prices looked lowest there
at the time.
cc1.4xlarge nodes were chosen because, well, they are badass!
Specifically, they have 10G Ethernet NICs, which is good when you're
about to slurp gobs of data from S3.
The cluster described above consisted of 1 m2.2xlarge master node,
spot priced at $0.07 per hour plus an EMR hourly fee of $0.21, and 50
cc1.4xlarge core nodes, spot priced at $0.21 per hour plus an EMR
hourly fee of $0.27.
That means that for 80 minutes (rounded up to 2 billable hours), this works out to:
cost = 2 * (1 * (0.07 + 0.21) + 50 * (0.21 + 0.27)) = $48.56
Not too expensive! Let me know if I botched my math. I haven't seen a drastic slope increase in our AWS billing curve, so I can't be that far off.
I've implemented the TLD Query using Cascalog, which strikes me as one of the most eloquent ways to express Hadoop map/reduce logic.
Let's dive in.
Common Crawl metadata files are organized into segment directories,
each containing several thousand files named metadata-NNNNN. As it
turns out, not all segments are ready for consumption. If you naively
use a glob like
s3://aws-publicdatasets/common-crawl/parse-output/segment/*/metadata*,
your Hadoop job will crash while trying to iterate/read the S3 objects
it matches, because some of those objects are not public. This manifests
itself as a ... Status Code: 403, AWS Service: Amazon S3 ... error.
The Common Crawl team publishes a file containing the IDs of valid segments. The rationale for this approach is discussed here.
Lesson 1. Use a hand-crafted glob with valid segments only.
The metadata-path function, shown below, produces such a glob based on
the hard-coded vector valid-segments.
(def valid-segments [1346823845675 ... 54 more elided ... 1346981172268])
(defn ^String metadata-path
  "Produces the glob of paths to metadata files organized
  according to CommonCrawl docs: http://tinyurl.com/common-crawl-about-dataset
  Here's an example (assuming valid segments are 1, 2 and 3):
  (metadata-path 's3://path/to/commoncrawl/segments')
  => 's3://path/to/commoncrawl/segments/{1,2,3}/metadata*'"
  [prefix]
  (->> valid-segments
       (s/join ",")
       (format "%s/{%s}/metadata*" prefix)))
The resulting glob will look as follows (assuming you specified the
prefix s3://aws-publicdatasets/common-crawl/parse-output/segment):
"s3://aws-publicdatasets/common-crawl/parse-output/segment/{seg1,seg2,seg3,...}/metadata*"
Moving on.
Each metadata file contains a sequence of records, each comprised of 2 items: a crawled URL and metadata JSON. This query does not care about the metadata JSON; it only cares about the hostname in each URL. The following bits of Clojure take care of parsing the TLD out of the URL's hostname (assuming it's a domain name).
The valid-tld set will be used by the query to check whether a string is
in fact a TLD. Look at the test code to see parse-tld in action.
;; A set of valid TLDs obtained from:
;; http://data.iana.org/TLD/tlds-alpha-by-domain.txt
(def valid-tld #{"ac" ... "com" ... "zw"})
(defn ^String parse-hostname
  "Extracts the hostname from the specified URL."
  [^Text url]
  (-> url str URI. .getHost (or "")))
(defn ^String parse-tld
  "Returns the piece of the URL's hostname after the last '.'
  Note: this may or may not be an actual TLD; we'll validate
  elsewhere."
  [^Text url]
  (-> url parse-hostname (s/split #"\.") peek))
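For illustration, here's a hypothetical REPL session (the URL is made up, and Text is assumed to be imported as in the project; the real tests live in the test code linked above):
(parse-hostname (Text. "http://www.example.co.uk/index.html"))
;; => "www.example.co.uk"
(parse-tld (Text. "http://www.example.co.uk/index.html"))
;; => "uk"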
Now let's look at the Cascalog query, query-tlds, that does all the work.
The bit right after <- specifies that the query produces a
sequence of tuples [?tld ?n] (e.g. [com 3], [uk 77], [net 2]).
Then it specifies that it only cares about ?url values coming out
of the source tap metadata-tap (remember: each metadata tuple
consists of two values, the crawled URL and the metadata JSON; the second
value is not important, so we indicate that with the _ placeholder).
Each ?url value is parsed into a ?tld value using the parse-tld
function we discussed earlier.
The predicate (valid-tld ?tld) checks whether ?tld is valid. If the result is
nil, the ?url in question is not counted.
The aggregator (c/count :> ?n) counts crawled ?url-s grouped by
?tld.
When your Hadoop job is about 73% through several billion URLs, you
wouldn't want it to be aborted because of some malformed tuples, would
you? That's why (:trap ...) is specified: it traps tuple values that
cause exceptions and sends them to the trap-tap sink.
(defn query-tlds
  "Counts site URLs from the metadata corpus grouped by TLD of each URL."
  [metadata-tap trap-tap]
  (<- [?tld ?n]
      (metadata-tap :> ?url _)
      (parse-tld ?url :> ?tld)
      (valid-tld ?tld)
      (c/count :> ?n)
      (:trap trap-tap)))
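Before pointing this at S3, you can sanity-check the query in a REPL with an in-memory source, since Cascalog accepts a plain Clojure sequence of tuples as a generator. The session below is a hypothetical sketch, not part of the repo (the sample tuples and the /tmp trap path are made up; it assumes you're in the project namespace with cascalog.api referred):
(let [metadata-tap [["http://www.example.com/" "{...json...}"]
                    ["http://foo.example.org/" "{...json...}"]
                    ["not-a-url" "{...json...}"]]
      trap-tap (hfs-seqfile "/tmp/tld-trap" :sinkmode :replace)]
  (??- (query-tlds metadata-tap trap-tap)))
;; => ((["com" 1] ["org" 1]))  ; order may vary; "not-a-url" yields
;; no valid TLD and is simply filtered out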
All Hadoop applications need to expose one or more entry-point classes
with a main method that defines pertinent job details. In
Cascalog, this is done without much fuss using defmain.
The tricky part: I discovered I couldn't use the default hfs-seqfile
source tap for processing Common Crawl metadata files. It turns out
that Cascalog's hfs-seqfile creates a SequenceFile Cascading tap.
Cascading sequence files store a sequence of serialized Cascading
Tuple objects. Common Crawl sequence files store a sequence of
serialized key/value pairs (both key and value are of
org.apache.hadoop.io Text type). So initially this query crashed
with serialization errors when it touched Common Crawl data. I found
that I needed to use a WritableSequenceFile source tap instead.
Handily, the more-taps library already provides such a tap creation
helper: hfs-wrtseqfile. And this was Lesson 2.
(defmain TldQueryExe
  "Defines 'main' method that will execute our query."
  [prefix output-dir]
  (let [metadata-tap (hfs-wrtseqfile (metadata-path prefix) Text Text :outfields ["key" "value"])
        trap-tap (hfs-seqfile (str output-dir ".trap"))]
    (?- (hfs-textline output-dir)
        (query-tlds metadata-tap trap-tap))))
You may invoke the TldQueryExe.main entry point from the command line like so:
hadoop jar ccooo-standalone.jar ccooo.commoncrawl.TldQueryExe \
  s3://aws-publicdatasets/common-crawl/parse-output/segment \
  s3://bucket-you-own/some/path
FYI, malformed tuples will automatically be written to
s3://bucket-you-own/some/path.trap thanks to the trap-tap (hfs-seqfile (str output-dir ".trap")) incantation above.
To run locally, you may want to download a couple of metadata files and
then use local file system paths instead of s3:// URLs. Say you
have one metadata file at ~/Downloads/1346823845675/metadata-00000
and you want to write the results to ./tld-results; the command-line
invocation would be:
hadoop jar ccooo-standalone.jar ccooo.commoncrawl.TldQueryExe ~/Downloads ./tld-results