Created
September 23, 2021 04:32
-
-
Save amdevine/ad17bf2600c764bd46a6f03b7a6b97c9 to your computer and use it in GitHub Desktop.
Encode dummy variables with dplyr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Encode dummy variables with dplyr\n", | |
| "\n", | |
| "This notebook contains one example of encoding dummy variables in R without numerous if-else statements." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "library(dplyr, warn.conflicts = FALSE)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Example data frame" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"dataframe\">\n", | |
| "<caption>A data.frame: 9 × 2</caption>\n", | |
| "<thead>\n", | |
| "\t<tr><th scope=col>temperature</th><th scope=col>weather</th></tr>\n", | |
| "\t<tr><th scope=col><chr></th><th scope=col><chr></th></tr>\n", | |
| "</thead>\n", | |
| "<tbody>\n", | |
| "\t<tr><td>Low </td><td>Sun </td></tr>\n", | |
| "\t<tr><td>Low </td><td>Clouds</td></tr>\n", | |
| "\t<tr><td>Low </td><td>Rain </td></tr>\n", | |
| "\t<tr><td>Med </td><td>Sun </td></tr>\n", | |
| "\t<tr><td>Med </td><td>Clouds</td></tr>\n", | |
| "\t<tr><td>Med </td><td>Rain </td></tr>\n", | |
| "\t<tr><td>High</td><td>Sun </td></tr>\n", | |
| "\t<tr><td>High</td><td>Clouds</td></tr>\n", | |
| "\t<tr><td>High</td><td>Rain </td></tr>\n", | |
| "</tbody>\n", | |
| "</table>\n" | |
| ], | |
| "text/latex": [ | |
| "A data.frame: 9 × 2\n", | |
| "\\begin{tabular}{ll}\n", | |
| " temperature & weather\\\\\n", | |
| " <chr> & <chr>\\\\\n", | |
| "\\hline\n", | |
| "\t Low & Sun \\\\\n", | |
| "\t Low & Clouds\\\\\n", | |
| "\t Low & Rain \\\\\n", | |
| "\t Med & Sun \\\\\n", | |
| "\t Med & Clouds\\\\\n", | |
| "\t Med & Rain \\\\\n", | |
| "\t High & Sun \\\\\n", | |
| "\t High & Clouds\\\\\n", | |
| "\t High & Rain \\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/markdown": [ | |
| "\n", | |
| "A data.frame: 9 × 2\n", | |
| "\n", | |
| "| temperature <chr> | weather <chr> |\n", | |
| "|---|---|\n", | |
| "| Low | Sun |\n", | |
| "| Low | Clouds |\n", | |
| "| Low | Rain |\n", | |
| "| Med | Sun |\n", | |
| "| Med | Clouds |\n", | |
| "| Med | Rain |\n", | |
| "| High | Sun |\n", | |
| "| High | Clouds |\n", | |
| "| High | Rain |\n", | |
| "\n" | |
| ], | |
| "text/plain": [ | |
| " temperature weather\n", | |
| "1 Low Sun \n", | |
| "2 Low Clouds \n", | |
| "3 Low Rain \n", | |
| "4 Med Sun \n", | |
| "5 Med Clouds \n", | |
| "6 Med Rain \n", | |
| "7 High Sun \n", | |
| "8 High Clouds \n", | |
| "9 High Rain " | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Pair every temperature level with every weather condition (3 x 3 = 9 rows).\n", | |
| "df <- data.frame(\n", | |
| "  temperature = rep(c('Low', 'Med', 'High'), each = 3),\n", | |
| "  weather = rep(c('Sun', 'Clouds', 'Rain'), times = 3)\n", | |
| ")\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Create dummy variables\n", | |
| "\n", | |
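| "Create dummy columns that contain logical (`TRUE`/`FALSE`) values. Because R coerces `TRUE` to 1 and `FALSE` to 0 when converting to numeric, these columns can simply be converted with `as.numeric` to get our encoded dummy variables. `Low` and `Sun` get no column of their own: they are the reference levels, represented by zeros in every dummy column for that variable.\n", | |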
| "\n", | |
| "`dplyr::across()` applies a function to each column in a selection of columns; here `med:rain` selects every column from `med` through `rain`.\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table class=\"dataframe\">\n", | |
| "<caption>A data.frame: 9 × 6</caption>\n", | |
| "<thead>\n", | |
| "\t<tr><th scope=col>temperature</th><th scope=col>weather</th><th scope=col>med</th><th scope=col>high</th><th scope=col>clouds</th><th scope=col>rain</th></tr>\n", | |
| "\t<tr><th scope=col><chr></th><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th></tr>\n", | |
| "</thead>\n", | |
| "<tbody>\n", | |
| "\t<tr><td>Low </td><td>Sun </td><td>0</td><td>0</td><td>0</td><td>0</td></tr>\n", | |
| "\t<tr><td>Low </td><td>Clouds</td><td>0</td><td>0</td><td>1</td><td>0</td></tr>\n", | |
| "\t<tr><td>Low </td><td>Rain </td><td>0</td><td>0</td><td>0</td><td>1</td></tr>\n", | |
| "\t<tr><td>Med </td><td>Sun </td><td>1</td><td>0</td><td>0</td><td>0</td></tr>\n", | |
| "\t<tr><td>Med </td><td>Clouds</td><td>1</td><td>0</td><td>1</td><td>0</td></tr>\n", | |
| "\t<tr><td>Med </td><td>Rain </td><td>1</td><td>0</td><td>0</td><td>1</td></tr>\n", | |
| "\t<tr><td>High</td><td>Sun </td><td>0</td><td>1</td><td>0</td><td>0</td></tr>\n", | |
| "\t<tr><td>High</td><td>Clouds</td><td>0</td><td>1</td><td>1</td><td>0</td></tr>\n", | |
| "\t<tr><td>High</td><td>Rain </td><td>0</td><td>1</td><td>0</td><td>1</td></tr>\n", | |
| "</tbody>\n", | |
| "</table>\n" | |
| ], | |
| "text/latex": [ | |
| "A data.frame: 9 × 6\n", | |
| "\\begin{tabular}{llllll}\n", | |
| " temperature & weather & med & high & clouds & rain\\\\\n", | |
| " <chr> & <chr> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n", | |
| "\\hline\n", | |
| "\t Low & Sun & 0 & 0 & 0 & 0\\\\\n", | |
| "\t Low & Clouds & 0 & 0 & 1 & 0\\\\\n", | |
| "\t Low & Rain & 0 & 0 & 0 & 1\\\\\n", | |
| "\t Med & Sun & 1 & 0 & 0 & 0\\\\\n", | |
| "\t Med & Clouds & 1 & 0 & 1 & 0\\\\\n", | |
| "\t Med & Rain & 1 & 0 & 0 & 1\\\\\n", | |
| "\t High & Sun & 0 & 1 & 0 & 0\\\\\n", | |
| "\t High & Clouds & 0 & 1 & 1 & 0\\\\\n", | |
| "\t High & Rain & 0 & 1 & 0 & 1\\\\\n", | |
| "\\end{tabular}\n" | |
| ], | |
| "text/markdown": [ | |
| "\n", | |
| "A data.frame: 9 × 6\n", | |
| "\n", | |
| "| temperature <chr> | weather <chr> | med <dbl> | high <dbl> | clouds <dbl> | rain <dbl> |\n", | |
| "|---|---|---|---|---|---|\n", | |
| "| Low | Sun | 0 | 0 | 0 | 0 |\n", | |
| "| Low | Clouds | 0 | 0 | 1 | 0 |\n", | |
| "| Low | Rain | 0 | 0 | 0 | 1 |\n", | |
| "| Med | Sun | 1 | 0 | 0 | 0 |\n", | |
| "| Med | Clouds | 1 | 0 | 1 | 0 |\n", | |
| "| Med | Rain | 1 | 0 | 0 | 1 |\n", | |
| "| High | Sun | 0 | 1 | 0 | 0 |\n", | |
| "| High | Clouds | 0 | 1 | 1 | 0 |\n", | |
| "| High | Rain | 0 | 1 | 0 | 1 |\n", | |
| "\n" | |
| ], | |
| "text/plain": [ | |
| " temperature weather med high clouds rain\n", | |
| "1 Low Sun 0 0 0 0 \n", | |
| "2 Low Clouds 0 0 1 0 \n", | |
| "3 Low Rain 0 0 0 1 \n", | |
| "4 Med Sun 1 0 0 0 \n", | |
| "5 Med Clouds 1 0 1 0 \n", | |
| "6 Med Rain 1 0 0 1 \n", | |
| "7 High Sun 0 1 0 0 \n", | |
| "8 High Clouds 0 1 1 0 \n", | |
| "9 High Rain 0 1 0 1 " | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "df <- df %>%\n", | |
| "  # Step 1: one logical flag per non-reference level.\n", | |
| "  mutate(\n", | |
| "    med = temperature == 'Med',\n", | |
| "    high = temperature == 'High',\n", | |
| "    clouds = weather == 'Clouds',\n", | |
| "    rain = weather == 'Rain'\n", | |
| "  ) %>%\n", | |
| "  # Step 2: turn the logical flags (med through rain) into numeric 0/1.\n", | |
| "  mutate(across(med:rain, as.numeric))\n", | |
| "df" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "R", | |
| "language": "R", | |
| "name": "ir" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": "r", | |
| "file_extension": ".r", | |
| "mimetype": "text/x-r-source", | |
| "name": "R", | |
| "pygments_lexer": "r", | |
| "version": "4.0.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment