This is a script to convert the crawling at home csv files to parquet.
See the result at https://colab.research.google.com/drive/14Hc_fUUOrG9260VzD_XsTxWX7f5cptyL?usp=sharing
'''
Compute some stats on the cah collection
First get the files with:
https://gist.github.com/rom1504/f427b1c82df26c9993daa36fca7f9881
Then pip install pyspark
Then run this file. It also takes a few minutes.
The main thing this script does is add, remove and reorder csv columns, then convert everything to fewer parquet files.
The end result is easy to use in spark, pandas or anything else (a short pandas example follows this script).
'''
from glob import glob
from multiprocessing import Pool
from collections import defaultdict
from pathlib import Path


def f(w):
    # Read only the header line of a csv file and split it into its column names
    return open(w, "r").readline().rstrip().split("|")
def main():
    p = Pool(128)
    # necessary because the schema changed
    print("Retrieving columns of all csv files")
    fs = [str(x) for x in Path('/media/hd/cah/drive').glob("**/*.csv")] + [str(x) for x in Path('/media/hd/cah/theeye/output/cah').glob("**/*.csv")]
    headers = p.map(f, fs)
    all = list(zip(headers, fs))
    print("Grouping files by columns")
    d = defaultdict(list)
    for cols, path in all:
        d[",".join(cols)].append(path)
    print("Starting spark session")
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import lit
    # You can open http://localhost:4040 to follow progress on the spark operations
    spark = SparkSession.builder.config("spark.driver.memory", "16G").master("local[16]").appName('spark-stats').getOrCreate()
    ref_cols = ['SAMPLE_ID', 'URL', 'TEXT', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW', 'similarity']
    total = None
    print("Reading all collections of csv, removing, adding and reordering columns as needed")
    for cols, paths in d.items():
        cols = cols.split(",")
        incols = [x for x in cols if x in ref_cols]
        print("incols", incols)
        w = spark.read.options(delimiter="|", header=True).csv(paths).select(*incols)
        addcols = [x for x in ref_cols if x not in cols]
        print("addcols", addcols)
        for c in addcols:
            w = w.withColumn(c, lit(""))
        w = w.select(*ref_cols)
        if total is None:
            total = w
        else:
            total = total.union(w)
| print("Casting columns to the right types") | |
| total = total.withColumn("SAMPLE_ID", total["SAMPLE_ID"].cast("bigint")) | |
| total = total.withColumn("WIDTH", total["WIDTH"].cast("int")) | |
| total = total.withColumn("HEIGHT", total["HEIGHT"].cast("int")) | |
| total = total.withColumn("similarity", total["similarity"].cast("double")) | |
| print("Repartitionning and writing to 16 parquet files to cah_dataframe") | |
| total.repartition(16).write.mode("overwrite").parquet("cah_dataframe") | |
| ok = spark.read.parquet("cah_dataframe") | |
| print("Rereading the parquet and computing some basic stats") | |
| print("Size of collection", ok.count()) | |
| uniques = ok.drop_duplicates(["URL", "TEXT"]) | |
| uniques.repartition(16).write.mode("overwrite").parquet("cah_dataframe_unique") | |
| ok_unique = spark.read.parquet("cah_dataframe_unique") | |
| print("Number of uniques", ok_unique.count()) | |
| main() |
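As the docstring says, the resulting parquet is easy to load outside of spark. A minimal sketch of reading the cah_dataframe output with pandas, assuming pyarrow (or fastparquet) is installed:

import pandas as pd

# Load the 16 parquet parts written by the script above into a single dataframe.
# With the pyarrow engine, pandas can read a directory of parquet files directly
# (the spark _SUCCESS marker file is ignored).
df = pd.read_parquet("cah_dataframe")
print(df.shape)
print(df[["URL", "TEXT", "WIDTH", "HEIGHT", "similarity"]].head())

For a collection that does not fit in memory, reading the parquet back with spark as in the stats script below is the safer route.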
'''
Once you have computed the parquet files with unique items,
let's compute more stats
'''
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
def main():
    spark = SparkSession.builder.config("spark.driver.memory", "16G").master("local[16]").appName('spark-stats').getOrCreate()
    df = spark.read.parquet("cah_dataframe_unique")
    df.printSchema()
    df.show(truncate=False)
    print("width quantiles", df.approxQuantile("WIDTH", [0.1*x for x in range(1,10)], 0.1))
    print("height quantiles", df.approxQuantile("HEIGHT", [0.1*x for x in range(1,10)], 0.1))
    print("similarity quantiles", df.approxQuantile("similarity", [0.1*x for x in range(1,10)], 0.1))
    df = df.withColumn("lentext", F.length("TEXT"))
    print("text length quantiles", df.approxQuantile("lentext", [0.1*x for x in range(1,10)], 0.1))
    print("Number of uniques", df.count())


main()
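The stats script uses approxQuantile with a relative error of 0.1 (the third argument), which is fast but coarse. A minimal sketch of recomputing one set of deciles with a tighter error bound, assuming the cah_dataframe_unique output from the first script is available:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[16]").appName("spark-stats").getOrCreate()
df = spark.read.parquet("cah_dataframe_unique")

# Same deciles as in the stats script, but with a 0.01 relative error instead of 0.1,
# trading a slower pass over the data for tighter quantile estimates.
deciles = [0.1 * x for x in range(1, 10)]
print("width quantiles (tighter)", df.approxQuantile("WIDTH", deciles, 0.01))

The rest of this page is the output of the stats script on the collection: the schema, a sample of 20 rows, and the quantiles.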
root
 |-- SAMPLE_ID: long (nullable = true)
 |-- URL: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- HEIGHT: integer (nullable = true)
 |-- WIDTH: integer (nullable = true)
 |-- LICENSE: string (nullable = true)
 |-- NSFW: string (nullable = true)
 |-- similarity: double (nullable = true)
+-----------+----+----+------+-----+-------+--------+-------------------+
|SAMPLE_ID  |URL |TEXT|HEIGHT|WIDTH|LICENSE|NSFW    |similarity         |
+-----------+----+----+------+-----+-------+--------+-------------------+
| |41826002453|http://cdn-s3-3.wanelo.com/product/image/5753773/x200.jpg |Hoop Earrings Beaded Hoop Earrings |200 |200 |? |UNLIKELY|0.3015734851360321 | | |
| |11286064458|http://images.knetbooks.com/images/d/7/232/9780321087232.jpg |Practical Guide to Secondary Social Studies, A |187 |187 |? |UNLIKELY|0.3707329034805298 | | |
| |15923025895|http://static8.depositphotos.com/1231854/999/i/450/dep_9991553-Young-glamorous-blonde-with-shopping-bag-holding-toy-terrier-dogs-holding-dog.jpg|Young glamorous blonde with shopping bag holding toy terrier dogs holding dog — Stock Photo #9991553|398 |600 |? |UNLIKELY|0.3059745728969574 | | |
| |10787043769|https://www.handcraftedmodelships.com/pictures/main/lighthouse-decor-beach-decorations23.jpg |Wooden White Net and Rope Lighthouse 15 |400 |300 |? |UNLIKELY|0.3204681873321533 | | |
| |30762015899|http://media.rightmove.co.uk/11k/10211/41348840/10211_311626A_11626_IMG_16_0000_max_135x100.JPG |3 bed Cottage for sale in Berkeley, Gloucestershire |135 |90 |? |UNLIKELY|0.31845608353614807| | |
| |33202002260|http://g2.img-dpreview.com/EE6A97AE86E14336BE9D29331FA5737B.jpg |Hang Glide over Cowichan |120 |90 |? |UNLIKELY|0.32427337765693665| | |
| |42981003590|http://cdn-s3-3.wanelo.com/product/image/3939089/x200.jpg |ANY Size ANY Colorway x High-Waisted Aztec Frayed Denim Shorts |200 |200 |? |UNLIKELY|0.3527474105358124 | | |
| |17772022270|http://images3.chictopia.com/photos/Maddinka/7896011255/yellow-wholesale-dress-bag-blue-stradivarius-top-cream-mango-skirt.jpg |yellow Wholesale-Dress bag - blue Stradivarius top - cream Mango skirt |300 |450 |? |UNLIKELY|0.3499804437160492 | | |
| |27789015457|http://static6.depositphotos.com/1036080/659/i/110/depositphotos_6598667-Couple-of-amazing-black-dobermans.jpg |Couple of amazing black dobermans - Foto de Stock |110 |110 |? |UNLIKELY|0.3847452998161316 | | |
| |34655009450|http://cdn1.image.tube8.phncdn.com/201103/07/706841/190x143/2.jpg |Army girl gets fucked by ... |190 |143 |? |UNLIKELY|0.3381859362125397 | | |
| |10112032340|http://cdn-s3-3.wanelo.com/product/image/1656336/original.jpg |Elegance Shawl / Scarf with Lacy Edge - leopard- |570 |715 |? |UNLIKELY|0.34083840250968933| | |
| |23205009139|http://i0.wp.com/hypebeast.com/image/2012/09/mastermind-japan-carhartt-2012-fall-winter-capsule-collection-1.jpg?w=570 |Image of mastermind JAPAN x Carhartt 2012 Fall/Winter Capsule Collection |570 |854 |? |UNLIKELY|0.3161979019641876 | | |
| |8161023068 |http://d2d00szk9na1qq.cloudfront.net/Product/36c0458d-8d82-4b0b-b1cb-b880acee1c3d/Images/List_UO-507.jpg |Magnolia Home Fashions Oxford Stripe Charcoal |150 |150 |? |UNLIKELY|0.3259333670139313 | | |
| |4838750426 |http://www.toggle.co.nz/media/catalog/product/cache/1/thumbnail/130x/9df78eab33525d08d6e5fb8d27136e95/k/e/keepcalm_blue.jpg |"Keep Calm" - Blue Canvas |130 |130 | |UNLIKELY|0.33144283294677734| | |
| |3469000780 |http://media.bdaily.s3.amazonaws.com/images/avatars/large/10682.jpg |Thomas Eggar |82 |82 | |UNLIKELY|0.35470208525657654| | |
| |12408032134|http://rlv.zcache.co.uk/ornate_formal_black_white_damask_custom_tie-rfe02c92ee866464db7d5b7c97571d0b5_v9whb_8byvr_216.jpg |ornate formal black white damask custom tie |216 |216 |? |UNLIKELY|0.3525067865848541 | | |
| |24984002927|http://cdn2.newsok.biz/cache/w640-6f653feb73c40138b825518939ac3557.jpg |The old Iten Biscuit Co. is now a U-Haul center. THE OKLAHOMAN ARCHIVES |640 |511 |? |UNLIKELY|0.3311549127101898 | | |
| |12525060020|http://s7d4.scene7.com/is/image/Belk?layer=0&src=1802968_202178317001_A_001_T10L00&layer=comp&$P_PROD$ |Lauren Ralph Lauren Plus Size Drawstring Cotton Cropped Pant |233 |338 |? |UNLIKELY|0.3262045085430145 | | |
| |12231074283|https://www.sportstadion.nl/media/catalog/category/newcastle-united.jpg |Newcastle United |698 |313 |? |UNLIKELY|0.3026922047138214 | | |
| |42445000967|http://media3.onsugar.com/files/upl1/10/104166/39_2008/catgenie.xlarge/i/Upgrade-Your-CatGenie-Power-Flush-System-Free.jpg |Upgrade Your CatGenie with the Power Flush System for Free! |320 |150 |? |UNLIKELY|0.3295552730560303 | | |
| +-----------+------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+------+-----+-------+--------+-------------------+ | |
only showing top 20 rows

width quantiles [0.0, 120.0, 151.0, 180.0, 215.0, 270.0, 273.0, 370.0, 39580.0]
height quantiles [0.0, 128.0, 160.0, 184.0, 216.0, 250.0, 300.0, 446.0, 18849.0]
similarity quantiles [0.0, 0.3069250285625458, 0.3145156800746918, 0.32135993242263794, 0.32168570160865784, 0.32813096046447754, 0.339599609375, 0.35535627603530884, 6016.0]
text length quantiles [1.0, 25.0, 33.0, 39.0, 45.0, 50.0, 56.0, 73.0, 61192.0]