Last active
March 21, 2021 17:11
-
-
Save delannoy/19325ba2f4fb067efc7fa9ce5439246e to your computer and use it in GitHub Desktop.
Return a vega dataset as a pandas.DataFrame. If no dataset name is given, parse the directory listing and return the list of available datasets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # [https://github.com/vega/vega-datasets] | |
| import lxml.html, lxml.cssselect, pandas, requests | |
| # [https://lxml.de/] [https://lxml.de/cssselect.html] | |
def vegaDatasets(dataset:str=None) -> pandas.DataFrame:
    """Load a vega-datasets file into a pandas.DataFrame, or list what is available.

    Args:
        dataset: File name under the vega-datasets ``data/`` directory, e.g.
            ``'penguins.json'``. Supported extensions are ``csv``, ``json``
            and ``tsv`` (matched case-insensitively).

    Returns:
        The parsed dataset as a ``pandas.DataFrame``. When ``dataset`` is
        empty or ``None``, prints ``available datasets:`` and returns a list
        of the file names found on the listing page at the base URL instead.

    Raises:
        ValueError: If ``dataset`` has an extension other than csv/json/tsv
            (previously this case silently returned ``None``).
        requests.HTTPError: If the listing request returns an error status.
    """
    base_url = 'https://cdn.jsdelivr.net/npm/vega-datasets/data/'

    def list_available(listing_url: str) -> list:
        # A timeout keeps an unresponsive server from hanging the caller, and
        # raise_for_status stops an error page from being parsed as a listing.
        response = requests.get(listing_url, timeout=30)
        response.raise_for_status()
        tree = lxml.html.fromstring(response.content)
        anchors = lxml.cssselect.CSSSelector('td.name a')(tree)
        print('available datasets:')
        # Skip anchors without text (a.text is None) and the parent-directory
        # ('..') entry.
        return [a.text for a in anchors if a.text and '..' not in a.text]

    if not dataset:
        return list_available(base_url)

    # Text after the last dot; the whole name when there is no dot.
    extension = dataset.rpartition('.')[2].lower()
    target = f'{base_url}{dataset}'
    if extension == 'csv':
        return pandas.read_csv(target)
    if extension == 'json':
        return pandas.read_json(target)
    if extension == 'tsv':
        return pandas.read_csv(target, sep='\t')
    # Checked before any network access so unsupported names fail fast.
    raise ValueError(
        f'unsupported dataset extension {extension!r} for {dataset!r}; '
        'expected one of: csv, json, tsv'
    )
| # >>> vegaDatasets() | |
| # available datasets: | |
| # ['7zip.png', 'airports.csv', 'annual-precip.json', 'anscombe.json', 'barley.json', 'birdstrikes.csv', 'budget.json', 'budgets.json', 'burtin.json', 'cars.json', 'co2-concentration.csv', 'countries.json', 'crimea.json', 'disasters.csv', 'driving.json', 'earthquakes.json', 'ffox.png', 'flare.json', 'flare-dependencies.json', 'flights-2k.json', 'flights-3m.csv', 'flights-5k.json', 'flights-10k.json', 'flights-20k.json', 'flights-200k.arrow', 'flights-200k.json', 'flights-airport.csv', 'football.json', 'gapminder.json', 'gapminder-health-income.csv', 'gimp.png', 'github.csv', 'income.json', 'iowa-electricity.csv', 'jobs.json', 'la-riots.csv', 'londonBoroughs.json', 'londonCentroids.json', 'londonTubeLines.json', 'lookup_groups.csv', 'lookup_people.csv', 'miserables.json', 'monarchs.json', 'movies.json', 'normal-2d.json', 'obesity.json', 'ohlc.json', 'penguins.json', 'points.json', 'political-contributions.json', 'population.json', 'population_engineers_hurricanes.csv', 'seattle-weather.csv', 'seattle-weather-hourly-normals.csv', 'sp500.csv', 'sp500-2000.csv', 'stocks.csv', 'udistrict.json', 'unemployment.tsv', 'unemployment-across-industries.json', 'uniform-2d.json', 'us-10m.json', 'us-employment.csv', 'us-state-capitals.json', 'volcano.json', 'weather.csv', 'weather.json', 'wheat.json', 'windvectors.csv', 'world-110m.json', 'zipcodes.csv'] | |
| # >>> vegaDatasets('penguins.json').head(4) | |
| # Species Island Beak Length (mm) Beak Depth (mm) Flipper Length (mm) Body Mass (g) Sex | |
| # 0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE | |
| # 1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE | |
| # 2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE | |
| # 3 Adelie Torgersen NaN NaN NaN NaN None |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment