Skip to content

Instantly share code, notes, and snippets.

@koenvo
Last active November 13, 2025 20:52
Show Gist options
  • Select an option

  • Save koenvo/e7d521e2867b710a47a8f1255c2d7894 to your computer and use it in GitHub Desktop.

Select an option

Save koenvo/e7d521e2867b710a47a8f1255c2d7894 to your computer and use it in GitHub Desktop.
import time
# Requires duckdb 0.7.0 or newer; earlier versions might fail with GIL problems ( https://twitter.com/mr_le_fox/status/1620535141675433986 )
import duckdb
import s3fs
from fsspec.implementations.cached import SimpleCacheFileSystem
# Create the S3 file system. On its own it performs no local caching.
s3_file_system = s3fs.S3FileSystem(
    anon=True,  # public bucket, no credentials needed
    default_block_size=100 * 1024 * 1024,  # 100 MiB blocks
    client_kwargs={
        'region_name': 'us-east-1',
    },
)


class S3SimpleCacheFileSystem(SimpleCacheFileSystem):
    """Whole-file cache that registers itself under the ``s3`` protocol.

    DuckDB routes a path to a registered fsspec filesystem based on the
    filesystem's ``protocol`` attribute. Users have reported that, with some
    fsspec versions, a plain ``SimpleCacheFileSystem`` does not get picked up
    for ``s3://`` paths unless ``protocol`` is set to ``'s3'`` explicitly, so
    we pin it here. Where the wrapper already reports the target's protocol
    this override changes nothing.
    """

    protocol = 's3'


# Wrap the S3 file system in a caching file system.
# Note 1: We use `SimpleCacheFileSystem` because it caches the entire file
#         instead of individual blocks. The regular `CachingFileSystem`
#         (block cache) does not seem to work when the same query is run
#         twice from the same connection.
# Note 2: Some method calls on the cache still reach the original file system
#         even when a cache entry exists. Here that causes a few HEAD requests
#         to S3, which take some time. The data itself is served from cache.
fs = S3SimpleCacheFileSystem(
    fs=s3_file_system,
    cache_storage="./tmp/",
)

# Create a new duckdb connection
con = duckdb.connect()

# Register the cached file system. Read more about duckdb fsspec support:
# https://duckdb.org/docs/guides/python/filesystems.html
con.register_filesystem(fs)
# The query is identical for both runs, so define it once. It is a plain
# string: there is nothing to interpolate, hence no f-prefix.
query = '''
select *
from read_csv_auto('s3://anaconda-public-datasets/gdelt/csv/20150906.export.csv')
limit 10
'''

# Time both runs with perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump if the system clock
# is adjusted mid-measurement.
# NOTE: `execute()` runs the query but does not fetch anything into Python;
# despite its name, `df` is the object returned by `execute()`, not a
# DataFrame. Call `.df()` or `.fetchall()` on it to materialise the rows.

# Run the query the first time (data has to be downloaded from S3)
t0 = time.perf_counter()
df = con.execute(query)
t1 = time.perf_counter()

# Same query second time (data should now come from the local cache)
df = con.execute(query)
t2 = time.perf_counter()

print(f"First run took: {(t1 - t0) * 1000:.1f}ms")
print(f"Second run took: {(t2 - t1) * 1000:.1f}ms")
"""
First run took: 16892.7ms
Second run took: 341.2ms
"""
@majidaldo
Copy link

I had to subclass `SimpleCacheFileSystem` and set `protocol = 's3'` on the subclass:

class MyFS(SimpleCacheFileSystem):
  """SimpleCacheFileSystem subclass whose class-level ``protocol`` is overridden to 's3'."""
  # 's3' makes sure s3:// paths are served by this filesystem instead of
  # DuckDB's built-in one; the value can be changed to another protocol name.
  protocol = 's3'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment