Created
December 19, 2020 21:16
-
-
Save vaclavdekanovsky/484e12d2052548f7fdb4b38cd38631e8 to your computer and use it in GitHub Desktop.
Julia CSV reader's type and types parameters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Type and types parameters of the Julia CSV parser\n", | |
| "Written in [Julia](https://julialang.org/). See [CSV.jl](https://csv.juliadata.org/stable/) and [DataFrames.jl](https://dataframes.juliadata.org/stable/) for more details." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "using CSV, DataFrames" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "All examples are based on string input, which is passed to Julia's CSV reader through `IOBuffer`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "\"c1|c2|c3|c4\\n\\\"1\\\"|2|c|1.5\\n\\\"C|D\\\"|16|x|2.33\\n\"" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data = \"\"\"c1|c2|c3|c4\n", | |
| "\"1\"|2|c|1.5\n", | |
| "\"C|D\"|16|x|2.33\n", | |
| "\"\"\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "You can set the same type for all columns using the `type` parameter, e.g. `String`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>String</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & String & String & String\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
| "\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", | |
| "├─────┼────────┼────────┼────────┼────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
| "│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# `type` parses every column as the same type\n", | |
| "CSV.read(IOBuffer(data), DataFrame; type=String)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Or specify the type for each column, or just for some of them, using a `Dict`. If a value cannot be parsed as the requested type, a warning is emitted and the value is replaced by `missing`, Julia's equivalent of pandas' `NaN`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "┌ Warning: thread = 1 warning: error parsing Int64 around row = 2, col = 4: \"1.5\n", | |
| "│ \", error=INVALID: OK | NEWLINE | INVALID_DELIMITER \n", | |
| "└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n", | |
| "┌ Warning: thread = 1 warning: error parsing Int64 around row = 3, col = 4: \"2.33\n", | |
| "│ \", error=INVALID: OK | NEWLINE | EOF | INVALID_DELIMITER \n", | |
| "└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CSV.Row:\n", | |
| " :c1 \"1\"\n", | |
| " :c2 \"2\"\n", | |
| " :c3 \"c\"\n", | |
| " :c4 missing\n", | |
| "CSV.Row:\n", | |
| " :c1 \"C|D\"\n", | |
| " :c2 \"16\"\n", | |
| " :c3 \"x\"\n", | |
| " :c4 missing\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# print each parsed row; :c2 is read as String and :c4 as Int64\n", | |
| "foreach(println, CSV.File(IOBuffer(data); types=Dict(:c2 => String, :c4 => Int64)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "┌ Warning: thread = 1 warning: error parsing Int64 around row = 2, col = 4: \"1.5\n", | |
| "│ \", error=INVALID: OK | NEWLINE | INVALID_DELIMITER \n", | |
| "└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n", | |
| "┌ Warning: thread = 1 warning: error parsing Int64 around row = 3, col = 4: \"2.33\n", | |
| "│ \", error=INVALID: OK | NEWLINE | EOF | INVALID_DELIMITER \n", | |
| "└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Int64?</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td><em>missing</em></td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td><em>missing</em></td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & String & String & Int64?\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & \\emph{missing} \\\\\n", | |
| "\t2 & C|D & 16 & x & \\emph{missing} \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64?\u001b[39m │\n", | |
| "├─────┼────────┼────────┼────────┼─────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ \u001b[90mmissing\u001b[39m │\n", | |
| "│ 2 │ C|D │ 16 │ x │ \u001b[90mmissing\u001b[39m │" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# specify types of the columns\n", | |
| "CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Int64))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "You can silence these warnings by passing `silencewarnings=true`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Int64?</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td><em>missing</em></td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td><em>missing</em></td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & String & String & Int64?\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & \\emph{missing} \\\\\n", | |
| "\t2 & C|D & 16 & x & \\emph{missing} \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64?\u001b[39m │\n", | |
| "├─────┼────────┼────────┼────────┼─────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ \u001b[90mmissing\u001b[39m │\n", | |
| "│ 2 │ C|D │ 16 │ x │ \u001b[90mmissing\u001b[39m │" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# specify types of the columns and suppress the resulting parse warnings\n", | |
| "CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Int64), silencewarnings=true)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Float32</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & String & String & Float32\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
| "\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat32\u001b[39m │\n", | |
| "├─────┼────────┼────────┼────────┼─────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
| "│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Float32 matches the data in c4, so every value parses and nothing becomes missing\n", | |
| "CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Float32))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Or specify types for all columns using a **Vector**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>Int64</th><th>String</th><th>Float64</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & Int64 & String & Float64\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
| "\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", | |
| "├─────┼────────┼───────┼────────┼─────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
| "│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# one type per column, in column order; a plain literal of types is already a Vector{DataType},\n", | |
| "# so no explicit Array{DataType,1}(...) conversion is needed\n", | |
| "types = [String, Int, String, Float64]\n", | |
| "CSV.read(IOBuffer(data), DataFrame; types=types)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>Int32</th><th>String</th><th>Float32</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
| ], | |
| "text/latex": [ | |
| "\\begin{tabular}{r|cccc}\n", | |
| "\t& c1 & c2 & c3 & c4\\\\\n", | |
| "\t\\hline\n", | |
| "\t& String & Int32 & String & Float32\\\\\n", | |
| "\t\\hline\n", | |
| "\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
| "\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/plain": [ | |
| "2×4 DataFrame\n", | |
| "│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
| "│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt32\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat32\u001b[39m │\n", | |
| "├─────┼────────┼───────┼────────┼─────────┤\n", | |
| "│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
| "│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "CSV.read(IOBuffer(data), DataFrame; types=[String, Int32, String, Float32])" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Julia 1.4.1", | |
| "language": "julia", | |
| "name": "julia-1.4" | |
| }, | |
| "language_info": { | |
| "file_extension": ".jl", | |
| "mimetype": "application/julia", | |
| "name": "julia", | |
| "version": "1.4.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment