Skip to content

Instantly share code, notes, and snippets.

@MNoorFawi
Created May 29, 2020 22:12
Show Gist options
  • Select an option

  • Save MNoorFawi/e79886245e1f9d527998d415c6d4cb31 to your computer and use it in GitHub Desktop.

Select an option

Save MNoorFawi/e79886245e1f9d527998d415c6d4cb31 to your computer and use it in GitHub Desktop.
preprocessing train and test data frames in Julia (v0.6.4)
"""
    preprocess(new::DataFrame, old::DataFrame)

Build a `Float32` feature frame from `new`, using `old` as the reference frame.

Columns are selected by the element types that `describe(old)` reports:

- `String` columns, except the last column of `old`, are one-hot encoded: every
  distinct value found in `old` becomes one 0/1 column named after that value.
  Values that occur only in `new` get no column.
- `Float64` and `Int64` columns (including the last column of `old`, if it is
  numeric) are min-max scaled with the minimum and maximum taken from `old`, so that
  `new` ends up on the scale of the reference data. Values of `new` outside that
  range land outside `[0, 1]`. A column that is constant in `old` becomes all zeros.

Returns a new `DataFrame` holding the one-hot columns followed by the scaled columns.
`preprocess(df, df)` encodes a frame against itself.
"""
function preprocess(new::DataFrame, old::DataFrame)
    meta = describe(old)
    coltypes = meta[:, :eltype]

    # Normalise to a Symbol so the name compares equal to the `:variable` entries.
    target = Symbol(names(old)[end])
    strcols = setdiff(meta[coltypes .== String, :variable], [target])
    numcols = meta[(coltypes .== Float64) .| (coltypes .== Int64), :variable]

    onehot = DataFrame()
    for col in strcols
        observed = new[:, col]
        # NOTE: an indicator column is named after the value alone, so a value shared
        # by two string columns ends up written to the same output column.
        for level in unique(old[:, col])
            onehot[:, Symbol(level)] = Float32.(observed .== level)
        end
    end

    scaled = DataFrame()
    for col in numcols
        # Scale with the reference statistics, not those of `new`; otherwise a test
        # frame would be mapped onto a different scale than the training frame.
        lo, hi = extrema(old[:, col])
        span = hi - lo
        observed = new[:, col]
        scaled[:, col] = if span == 0
            # Constant reference column: avoid 0/0 and x/0.
            zeros(Float32, length(observed))
        else
            Float32.((observed .- lo) ./ span)
        end
    end

    # A binary target column is not produced here; if one is needed, the caller can
    # append it to the result, e.g. a UInt8 vector derived from the last column of `new`.
    return hcat(onehot, scaled)
end;
# run it on train based on itself and on test with train as a reference
#encoded_train = preprocess(trn, trn);
#encoded_test = preprocess(tst, trn);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment