Skip to content

Instantly share code, notes, and snippets.

@fr1ll
Created October 6, 2020 02:41
Show Gist options
  • Select an option

  • Save fr1ll/7673abe5b3f1b640bc6c0e0fe2feaf8c to your computer and use it in GitHub Desktop.

Select an option

Save fr1ll/7673abe5b3f1b640bc6c0e0fe2feaf8c to your computer and use it in GitHub Desktop.
Compare performance of 'map' versus 'replace' for pandas series
import pandas as pd
import numpy as np
import timeit
def random_series(count=10000, seed=None):
    """Return a pandas Series of ``count`` random integers named "RANDVALS".

    Values are drawn with ``randint(0, 20 * count, count)``, so the upper
    bound grows with the requested length.

    Args:
        count: Number of values to generate.
        seed: Optional seed for a reproducible draw. When ``None`` (the
            default) the module-level ``np.random`` generator is used, so an
            earlier ``np.random.seed(...)`` call is still honoured.

    Returns:
        A ``pd.Series`` of length ``count`` whose name is "RANDVALS".
    """
    if seed is None:
        vals = np.random.randint(0, 20 * count, count)
    else:
        # A private RandomState keeps a seeded draw from disturbing the
        # global generator's state.
        vals = np.random.RandomState(seed).randint(0, 20 * count, count)
    return pd.Series(data=vals, name="RANDVALS")
def unique_map(s: pd.Series):
    """Map each distinct value of ``s`` to its position in ``s.unique()``.

    Args:
        s: Series whose distinct values become the dictionary keys.

    Returns:
        A dict ``{value: index}`` where ``index`` counts up from 0 in the
        order the values are returned by ``s.unique()``.
    """
    distinct = s.unique()
    positions = range(len(distinct))
    return dict(zip(distinct, positions))
# Statements to benchmark, kept as source strings; each one refers to the
# names `s` and `d_map`.
funcs = [
    "s.replace(d_map)",
    "s.map(d_map)",
    "s.map(d_map.get)",
]
# Series lengths to benchmark: powers of ten from 10 up to 100000.
counts = [10 ** exponent for exponent in range(1, 6)]
def time_some_counts(func, counts):
    """Time one statement against random series of several lengths.

    Args:
        func: Source string of the statement to time. It may refer to ``s``
            (the series generated for the current length) and ``d_map`` (the
            mapping built from it by ``unique_map``), as well as any
            module-level name.
        counts: Iterable of series lengths to benchmark.

    Returns:
        A list with one row per length, each of the form
        ``[func, length, runs, total_runtime, mean_runtime]``. ``runs`` and
        ``total_runtime`` are the pair returned by
        ``timeit.Timer.autorange()``; ``mean_runtime`` is
        ``total_runtime / runs``.
    """
    times = []
    for c in counts:
        # create data with length set by c
        s = random_series(c)
        d_map = unique_map(s)
        # Hand timeit a *copy* of the module namespace extended with this
        # round's data, so the real module globals are never overwritten.
        namespace = {**globals(), 's': s, 'd_map': d_map}
        # time the statement; autorange() picks the number of runs itself
        runs, total_runtime = timeit.Timer(func, globals=namespace).autorange()
        times.append([func, c, runs, total_runtime, total_runtime / runs])
    return times
columns = ["func", "length_of_series", "runs", "total_runtime", "mean_runtime"]
# Benchmark every statement and gather one frame of timing rows per statement.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# frames are collected in a list and joined with a single pd.concat call.
# Row labels are kept as they were (each statement's rows are labelled from 0).
_frames = []
for f in funcs:
    times = time_some_counts(f, counts)
    _frames.append(pd.DataFrame(times, columns=columns))
# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected columns.
df = pd.concat(_frames) if _frames else pd.DataFrame(columns=columns)
import altair as alt
# Scatter plot of mean runtime against series length, both on log scales,
# with one colour per benchmarked statement.
x_axis = alt.X('length_of_series', scale=alt.Scale(type="log"))
y_axis = alt.Y('mean_runtime', scale=alt.Scale(type="log"))
chart = alt.Chart(df).mark_circle(size=60).encode(
    x=x_axis,
    y=y_axis,
    color='func',
)
# Power-law regression line per statement, layered over the points.
power_fit = chart.transform_regression(
    'length_of_series',
    'mean_runtime',
    groupby=["func"],
    method="pow",
).mark_line()
# Bare final expression kept as in the original (the layered chart itself).
chart + power_fit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment