Skip to content

Instantly share code, notes, and snippets.

@delannoy
Last active March 21, 2021 17:11
Show Gist options
  • Select an option

  • Save delannoy/19325ba2f4fb067efc7fa9ce5439246e to your computer and use it in GitHub Desktop.

Select an option

Save delannoy/19325ba2f4fb067efc7fa9ce5439246e to your computer and use it in GitHub Desktop.
Return a vega dataset as a pandas.DataFrame. If no dataset name is provided, fetch and return the list of available datasets.
#!/usr/bin/env python3
# [https://github.com/vega/vega-datasets]
import lxml.html, lxml.cssselect, pandas, requests
# [https://lxml.de/] [https://lxml.de/cssselect.html]
def vegaDatasets(dataset:str=None) -> pandas.DataFrame:
    """Load a vega-datasets file from the jsDelivr CDN as a pandas.DataFrame.

    Args:
        dataset: File name of the dataset including its extension
            (e.g. ``'penguins.json'``). Supported extensions are ``csv``,
            ``json`` and ``tsv`` (matched case-insensitively).

    Returns:
        The parsed dataset as a ``pandas.DataFrame``. When ``dataset`` is
        empty or ``None``, a list of the file names found on the CDN
        listing page is returned instead (after printing a header line).

    Raises:
        ValueError: If ``dataset`` has an extension that cannot be parsed
            into a DataFrame (e.g. ``.png`` or ``.arrow``).
        requests.HTTPError: If the listing page answers with an error status.
    """
    url = 'https://cdn.jsdelivr.net/npm/vega-datasets/data/'

    def checkExt(dataset:str, ext:str):
        # Compare only the text after the last dot, ignoring case.
        return dataset.split('.')[-1].lower() == ext

    def availableDatasets(url:str):
        # A timeout keeps the call from hanging forever on a stalled
        # connection; raise_for_status avoids parsing an error page.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        eTree = lxml.html.fromstring(resp.content)
        csssel = lxml.cssselect.CSSSelector
        print('available datasets:')
        # Skip anchors without text and the parent-directory ('..') link.
        return [e.text for e in csssel('td.name a')(eTree) if e.text and '..' not in e.text]

    if not dataset:
        return availableDatasets(url)
    if checkExt(dataset, 'csv'):
        return pandas.read_csv(f'{url}{dataset}')
    if checkExt(dataset, 'json'):
        return pandas.read_json(f'{url}{dataset}')
    if checkExt(dataset, 'tsv'):
        return pandas.read_csv(f'{url}{dataset}', sep='\t')
    # Previously this fell through and silently returned None.
    raise ValueError(
        f'unsupported dataset format: {dataset!r} '
        "(supported extensions: 'csv', 'json', 'tsv')"
    )
# >>> vegaDatasets()
# available datasets:
# ['7zip.png', 'airports.csv', 'annual-precip.json', 'anscombe.json', 'barley.json', 'birdstrikes.csv', 'budget.json', 'budgets.json', 'burtin.json', 'cars.json', 'co2-concentration.csv', 'countries.json', 'crimea.json', 'disasters.csv', 'driving.json', 'earthquakes.json', 'ffox.png', 'flare.json', 'flare-dependencies.json', 'flights-2k.json', 'flights-3m.csv', 'flights-5k.json', 'flights-10k.json', 'flights-20k.json', 'flights-200k.arrow', 'flights-200k.json', 'flights-airport.csv', 'football.json', 'gapminder.json', 'gapminder-health-income.csv', 'gimp.png', 'github.csv', 'income.json', 'iowa-electricity.csv', 'jobs.json', 'la-riots.csv', 'londonBoroughs.json', 'londonCentroids.json', 'londonTubeLines.json', 'lookup_groups.csv', 'lookup_people.csv', 'miserables.json', 'monarchs.json', 'movies.json', 'normal-2d.json', 'obesity.json', 'ohlc.json', 'penguins.json', 'points.json', 'political-contributions.json', 'population.json', 'population_engineers_hurricanes.csv', 'seattle-weather.csv', 'seattle-weather-hourly-normals.csv', 'sp500.csv', 'sp500-2000.csv', 'stocks.csv', 'udistrict.json', 'unemployment.tsv', 'unemployment-across-industries.json', 'uniform-2d.json', 'us-10m.json', 'us-employment.csv', 'us-state-capitals.json', 'volcano.json', 'weather.csv', 'weather.json', 'wheat.json', 'windvectors.csv', 'world-110m.json', 'zipcodes.csv']
# >>> vegaDatasets('penguins.json').head(4)
# Species Island Beak Length (mm) Beak Depth (mm) Flipper Length (mm) Body Mass (g) Sex
# 0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
# 1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
# 2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
# 3 Adelie Torgersen NaN NaN NaN NaN None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment