@raov5
Created September 13, 2017 23:05
This notebook covers:
  • Descriptive statistics
  • Frequency and contingency tables
  • Correlations and covariances
  • t-tests
  • Nonparametric statistics
{
"metadata": {
"kernelspec": {
"language": "R",
"display_name": "R with Spark 2.0",
"name": "r-spark20"
},
"language_info": {
"version": "3.3.2",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"codemirror_mode": "r",
"pygments_lexer": "r"
}
},
"cells": [
{
"execution_count": 70,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#@author: Venky Rao [email protected]\n#@last edited: 13 Sep 2017\n#@source: materials, data and examples adapted from R in Action 2nd Edition by Dr. Robert Kabacoff"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Basic Statistics"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Descriptive statistics"
},
{
"execution_count": 2,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lll}\n & mpg & hp & wt\\\\\n\\hline\n\tMazda RX4 & 21.0 & 110 & 2.620\\\\\n\tMazda RX4 Wag & 21.0 & 110 & 2.875\\\\\n\tDatsun 710 & 22.8 & 93 & 2.320\\\\\n\tHornet 4 Drive & 21.4 & 110 & 3.215\\\\\n\tHornet Sportabout & 18.7 & 175 & 3.440\\\\\n\tValiant & 18.1 & 105 & 3.460\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>mpg</th><th scope=col>hp</th><th scope=col>wt</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Mazda RX4</th><td>21.0 </td><td>110 </td><td>2.620</td></tr>\n\t<tr><th scope=row>Mazda RX4 Wag</th><td>21.0 </td><td>110 </td><td>2.875</td></tr>\n\t<tr><th scope=row>Datsun 710</th><td>22.8 </td><td> 93 </td><td>2.320</td></tr>\n\t<tr><th scope=row>Hornet 4 Drive</th><td>21.4 </td><td>110 </td><td>3.215</td></tr>\n\t<tr><th scope=row>Hornet Sportabout</th><td>18.7 </td><td>175 </td><td>3.440</td></tr>\n\t<tr><th scope=row>Valiant</th><td>18.1 </td><td>105 </td><td>3.460</td></tr>\n</tbody>\n</table>\n",
"text/plain": " mpg hp wt \nMazda RX4 21.0 110 2.620\nMazda RX4 Wag 21.0 110 2.875\nDatsun 710 22.8 93 2.320\nHornet 4 Drive 21.4 110 3.215\nHornet Sportabout 18.7 175 3.440\nValiant 18.1 105 3.460"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#in this section we will review measures of central tendency, variability and distribution shape\n#for continuous variables\n#we will use the mtcars dataset that ships with the base installation of R\nmyvars <- c(\"mpg\", \"hp\", \"wt\")\nhead(mtcars[myvars])"
},
{
"execution_count": 3,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#first we will look at descriptive statistics for all 32 cars. Then, we will examine\n#descriptive statistics by transmission type (am) and number of cylinders (cyl)\n#transmission type is a dichotomous variable coded 0 = automatic, 1 = manual\n#number of cylinders can be 4, 6 or 8"
},
{
"execution_count": 4,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " mpg hp wt \n Min. :10.40 Min. : 52.0 Min. :1.513 \n 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 \n Median :19.20 Median :123.0 Median :3.325 \n Mean :20.09 Mean :146.7 Mean :3.217 \n 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 \n Max. :33.90 Max. :335.0 Max. :5.424 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#in the base installation of R, you can use the summary() function to obtain\n#descriptive statistics\nsummary(mtcars[myvars])"
},
{
"execution_count": 5,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#creating a user-defined function for summary statistics\nmystats <- function(x, na.omit = FALSE) {\n if (na.omit) {\n x <- x[!is.na(x)] #drop missing values\n }\n m <- mean(x)\n n <- length(x)\n s <- sd(x)\n skew <- sum((x - m)^3 / s^3) / n #skewness\n kurt <- sum((x - m)^4 / s^4) / n - 3 #excess kurtosis\n return(c(n = n, mean = m, stdev = s, skew = skew, kurtosis = kurt))\n}"
},
{
"execution_count": 6,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lll}\n & mpg & hp & wt\\\\\n\\hline\n\tn & 32.000 & 32.000 & 32.0000\\\\\n\tmean & 20.091 & 146.688 & 3.2172\\\\\n\tstdev & 6.027 & 68.563 & 0.9785\\\\\n\tskew & 0.611 & 0.726 & 0.4231\\\\\n\tkurtosis & -0.373 & -0.136 & -0.0227\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>mpg</th><th scope=col>hp</th><th scope=col>wt</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>n</th><td>32.000 </td><td> 32.000</td><td>32.0000</td></tr>\n\t<tr><th scope=row>mean</th><td>20.091 </td><td>146.688</td><td> 3.2172</td></tr>\n\t<tr><th scope=row>stdev</th><td> 6.027 </td><td> 68.563</td><td> 0.9785</td></tr>\n\t<tr><th scope=row>skew</th><td> 0.611 </td><td> 0.726</td><td> 0.4231</td></tr>\n\t<tr><th scope=row>kurtosis</th><td>-0.373 </td><td> -0.136</td><td>-0.0227</td></tr>\n</tbody>\n</table>\n",
"text/plain": " mpg hp wt \nn 32.000 32.000 32.0000\nmean 20.091 146.688 3.2172\nstdev 6.027 68.563 0.9785\nskew 0.611 0.726 0.4231\nkurtosis -0.373 -0.136 -0.0227",
"text/markdown": "1. 32\n2. 20.090625\n3. 6.0269480520891\n4. 0.610655017573288\n5. -0.372766029820891\n6. 32\n7. 146.6875\n8. 68.5628684893206\n9. 0.726023656361273\n10. -0.135551121097421\n11. 32\n12. 3.21725\n13. 0.978457442989697\n14. 0.423146464177225\n15. -0.0227107528393127\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#applying our user-defined function to our dataset\noptions(digits = 3) #limit output to 3 significant digits for readability\nsapply(mtcars[myvars], mystats)"
},
{
"execution_count": 7,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Loading required package: lattice\n\nAttaching package: \u2018lattice\u2019\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n histogram\n\nLoading required package: survival\nLoading required package: Formula\nLoading required package: ggplot2\n\nAttaching package: \u2018Hmisc\u2019\n\nThe following objects are masked from \u2018package:SparkR\u2019:\n\n ceil, describe, summarize, translate\n\nThe following objects are masked from \u2018package:base\u2019:\n\n format.pval, round.POSIXt, trunc.POSIXt, units\n\n"
},
{
"metadata": {},
"data": {
"text/plain": "mtcars[myvars] \n\n 3 Variables 32 Observations\n--------------------------------------------------------------------------------\nmpg \n n missing distinct Info Mean Gmd .05 .10 \n 32 0 25 0.999 20.09 6.796 12.00 14.34 \n .25 .50 .75 .90 .95 \n 15.43 19.20 22.80 30.09 31.30 \n\nlowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9\n--------------------------------------------------------------------------------\nhp \n n missing distinct Info Mean Gmd .05 .10 \n 32 0 22 0.997 146.7 77.04 63.65 66.00 \n .25 .50 .75 .90 .95 \n 96.50 123.00 180.00 243.50 253.55 \n\nlowest : 52 62 65 66 91, highest: 215 230 245 264 335\n--------------------------------------------------------------------------------\nwt \n n missing distinct Info Mean Gmd .05 .10 \n 32 0 29 0.999 3.217 1.089 1.736 1.956 \n .25 .50 .75 .90 .95 \n 2.581 3.325 3.610 4.048 5.293 \n\nlowest : 1.51 1.61 1.83 1.94 2.14, highest: 3.85 4.07 5.25 5.34 5.42\n--------------------------------------------------------------------------------"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics via the describe() function in the Hmisc package\nlibrary(Hmisc) #load the Hmisc package\ndescribe(mtcars[myvars])"
},
{
"execution_count": 8,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\nLoading required package: boot\n\nAttaching package: \u2018boot\u2019\n\nThe following object is masked from \u2018package:survival\u2019:\n\n aml\n\nThe following object is masked from \u2018package:lattice\u2019:\n\n melanoma\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n corr\n\n\nAttaching package: \u2018pastecs\u2019\n\nThe following objects are masked from \u2018package:SparkR\u2019:\n\n first, last\n\n"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lll}\n & mpg & hp & wt\\\\\n\\hline\n\tnbr.val & 32.00 & 32.000 & 32.000 \\\\\n\tnbr.null & 0.00 & 0.000 & 0.000 \\\\\n\tnbr.na & 0.00 & 0.000 & 0.000 \\\\\n\tmin & 10.40 & 52.000 & 1.513 \\\\\n\tmax & 33.90 & 335.000 & 5.424 \\\\\n\trange & 23.50 & 283.000 & 3.911 \\\\\n\tsum & 642.90 & 4694.000 & 102.952 \\\\\n\tmedian & 19.20 & 123.000 & 3.325 \\\\\n\tmean & 20.09 & 146.688 & 3.217 \\\\\n\tSE.mean & 1.07 & 12.120 & 0.173 \\\\\n\tCI.mean.0.95 & 2.17 & 24.720 & 0.353 \\\\\n\tvar & 36.32 & 4700.867 & 0.957 \\\\\n\tstd.dev & 6.03 & 68.563 & 0.978 \\\\\n\tcoef.var & 0.30 & 0.467 & 0.304 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>mpg</th><th scope=col>hp</th><th scope=col>wt</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>nbr.val</th><td> 32.00 </td><td> 32.000</td><td> 32.000 </td></tr>\n\t<tr><th scope=row>nbr.null</th><td> 0.00 </td><td> 0.000</td><td> 0.000 </td></tr>\n\t<tr><th scope=row>nbr.na</th><td> 0.00 </td><td> 0.000</td><td> 0.000 </td></tr>\n\t<tr><th scope=row>min</th><td> 10.40 </td><td> 52.000</td><td> 1.513 </td></tr>\n\t<tr><th scope=row>max</th><td> 33.90 </td><td> 335.000</td><td> 5.424 </td></tr>\n\t<tr><th scope=row>range</th><td> 23.50 </td><td> 283.000</td><td> 3.911 </td></tr>\n\t<tr><th scope=row>sum</th><td>642.90 </td><td>4694.000</td><td>102.952 </td></tr>\n\t<tr><th scope=row>median</th><td> 19.20 </td><td> 123.000</td><td> 3.325 </td></tr>\n\t<tr><th scope=row>mean</th><td> 20.09 </td><td> 146.688</td><td> 3.217 </td></tr>\n\t<tr><th scope=row>SE.mean</th><td> 1.07 </td><td> 12.120</td><td> 0.173 </td></tr>\n\t<tr><th scope=row>CI.mean.0.95</th><td> 2.17 </td><td> 24.720</td><td> 0.353 </td></tr>\n\t<tr><th scope=row>var</th><td> 36.32 </td><td>4700.867</td><td> 0.957 </td></tr>\n\t<tr><th scope=row>std.dev</th><td> 6.03 </td><td> 68.563</td><td> 0.978 </td></tr>\n\t<tr><th scope=row>coef.var</th><td> 0.30 </td><td> 0.467</td><td> 0.304 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " mpg hp wt \nnbr.val 32.00 32.000 32.000\nnbr.null 0.00 0.000 0.000\nnbr.na 0.00 0.000 0.000\nmin 10.40 52.000 1.513\nmax 33.90 335.000 5.424\nrange 23.50 283.000 3.911\nsum 642.90 4694.000 102.952\nmedian 19.20 123.000 3.325\nmean 20.09 146.688 3.217\nSE.mean 1.07 12.120 0.173\nCI.mean.0.95 2.17 24.720 0.353\nvar 36.32 4700.867 0.957\nstd.dev 6.03 68.563 0.978\ncoef.var 0.30 0.467 0.304"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics via the stat.desc() function in the pastecs package\ninstall.packages(\"pastecs\") #install pastecs\nlibrary(pastecs) #load the pastecs package\nstat.desc(mtcars[myvars])"
},
{
"execution_count": 9,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "\nAttaching package: \u2018psych\u2019\n\nThe following object is masked from \u2018package:boot\u2019:\n\n logit\n\nThe following object is masked from \u2018package:Hmisc\u2019:\n\n describe\n\nThe following objects are masked from \u2018package:ggplot2\u2019:\n\n %+%, alpha\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n describe\n\n"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lllllllllllll}\n & vars & n & mean & sd & median & trimmed & mad & min & max & range & skew & kurtosis & se\\\\\n\\hline\n\tmpg & 1 & 32 & 20.09 & 6.027 & 19.20 & 19.70 & 5.411 & 10.40 & 33.90 & 23.50 & 0.611 & -0.3728 & 1.065 \\\\\n\thp & 2 & 32 & 146.69 & 68.563 & 123.00 & 141.19 & 77.095 & 52.00 & 335.00 & 283.00 & 0.726 & -0.1356 & 12.120 \\\\\n\twt & 3 & 32 & 3.22 & 0.978 & 3.33 & 3.15 & 0.767 & 1.51 & 5.42 & 3.91 & 0.423 & -0.0227 & 0.173 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>vars</th><th scope=col>n</th><th scope=col>mean</th><th scope=col>sd</th><th scope=col>median</th><th scope=col>trimmed</th><th scope=col>mad</th><th scope=col>min</th><th scope=col>max</th><th scope=col>range</th><th scope=col>skew</th><th scope=col>kurtosis</th><th scope=col>se</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>mpg</th><td>1 </td><td>32 </td><td> 20.09 </td><td> 6.027 </td><td> 19.20 </td><td> 19.70 </td><td> 5.411 </td><td>10.40 </td><td> 33.90 </td><td> 23.50 </td><td>0.611 </td><td>-0.3728</td><td> 1.065 </td></tr>\n\t<tr><th scope=row>hp</th><td>2 </td><td>32 </td><td>146.69 </td><td>68.563 </td><td>123.00 </td><td>141.19 </td><td>77.095 </td><td>52.00 </td><td>335.00 </td><td>283.00 </td><td>0.726 </td><td>-0.1356</td><td>12.120 </td></tr>\n\t<tr><th scope=row>wt</th><td>3 </td><td>32 </td><td> 3.22 </td><td> 0.978 </td><td> 3.33 </td><td> 3.15 </td><td> 0.767 </td><td> 1.51 </td><td> 5.42 </td><td> 3.91 </td><td>0.423 </td><td>-0.0227</td><td> 0.173 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " vars n mean sd median trimmed mad min max range skew \nmpg 1 32 20.09 6.027 19.20 19.70 5.411 10.40 33.90 23.50 0.611\nhp 2 32 146.69 68.563 123.00 141.19 77.095 52.00 335.00 283.00 0.726\nwt 3 32 3.22 0.978 3.33 3.15 0.767 1.51 5.42 3.91 0.423\n kurtosis se \nmpg -0.3728 1.065\nhp -0.1356 12.120\nwt -0.0227 0.173"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics via the describe() function in the psych package\nlibrary(psych) #load the psych package\ndescribe(mtcars[myvars])"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Descriptive statistics by group"
},
{
"execution_count": 10,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llll}\n am & mpg & hp & wt\\\\\n\\hline\n\t 0 & 17.1 & 160 & 3.77\\\\\n\t 1 & 24.4 & 127 & 2.41\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th scope=col>am</th><th scope=col>mpg</th><th scope=col>hp</th><th scope=col>wt</th></tr></thead>\n<tbody>\n\t<tr><td>0 </td><td>17.1</td><td>160 </td><td>3.77</td></tr>\n\t<tr><td>1 </td><td>24.4</td><td>127 </td><td>2.41</td></tr>\n</tbody>\n</table>\n",
"text/plain": " am mpg hp wt \n1 0 17.1 160 3.77\n2 1 24.4 127 2.41"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics by group using aggregate()\n#example 1: mean\naggregate(mtcars[myvars], by = list(am = mtcars$am), mean)"
},
{
"execution_count": 11,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llll}\n am & mpg & hp & wt\\\\\n\\hline\n\t 0 & 3.83 & 53.9 & 0.777\\\\\n\t 1 & 6.17 & 84.1 & 0.617\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th scope=col>am</th><th scope=col>mpg</th><th scope=col>hp</th><th scope=col>wt</th></tr></thead>\n<tbody>\n\t<tr><td>0 </td><td>3.83 </td><td>53.9 </td><td>0.777</td></tr>\n\t<tr><td>1 </td><td>6.17 </td><td>84.1 </td><td>0.617</td></tr>\n</tbody>\n</table>\n",
"text/plain": " am mpg hp wt \n1 0 3.83 53.9 0.777\n2 1 6.17 84.1 0.617"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics by group using aggregate()\n#example 2: standard deviation\naggregate(mtcars[myvars], by = list(am = mtcars$am), sd)"
},
{
"execution_count": 12,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "mtcars$am: 0\n mpg hp wt\nn 19.000 19.0000 19.000\nmean 17.147 160.2632 3.769\nstdev 3.834 53.9082 0.777\nskew 0.014 -0.0142 0.976\nkurtosis -0.803 -1.2097 0.142\n------------------------------------------------------------ \nmtcars$am: 1\n mpg hp wt\nn 13.0000 13.000 13.000\nmean 24.3923 126.846 2.411\nstdev 6.1665 84.062 0.617\nskew 0.0526 1.360 0.210\nkurtosis -1.4554 0.563 -1.174"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#descriptive statistics by group using by()\ndstats <- function(x) { #create a user-defined function\n sapply(x, mystats)\n}\nby(mtcars[myvars], mtcars$am, dstats)"
},
{
"execution_count": 13,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\n\nAttaching package: \u2018doBy\u2019\n\nThe following objects are masked from \u2018package:SparkR\u2019:\n\n orderBy, sampleBy\n\n"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llllllllllllllll}\n am & mpg.n & mpg.mean & mpg.stdev & mpg.skew & mpg.kurtosis & hp.n & hp.mean & hp.stdev & hp.skew & hp.kurtosis & wt.n & wt.mean & wt.stdev & wt.skew & wt.kurtosis\\\\\n\\hline\n\t 0 & 19 & 17.1 & 3.83 & 0.0140 & -0.803 & 19 & 160 & 53.9 & -0.0142 & -1.210 & 19 & 3.77 & 0.777 & 0.976 & 0.142 \\\\\n\t 1 & 13 & 24.4 & 6.17 & 0.0526 & -1.455 & 13 & 127 & 84.1 & 1.3599 & 0.563 & 13 & 2.41 & 0.617 & 0.210 & -1.174 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th scope=col>am</th><th scope=col>mpg.n</th><th scope=col>mpg.mean</th><th scope=col>mpg.stdev</th><th scope=col>mpg.skew</th><th scope=col>mpg.kurtosis</th><th scope=col>hp.n</th><th scope=col>hp.mean</th><th scope=col>hp.stdev</th><th scope=col>hp.skew</th><th scope=col>hp.kurtosis</th><th scope=col>wt.n</th><th scope=col>wt.mean</th><th scope=col>wt.stdev</th><th scope=col>wt.skew</th><th scope=col>wt.kurtosis</th></tr></thead>\n<tbody>\n\t<tr><td>0 </td><td>19 </td><td>17.1 </td><td>3.83 </td><td>0.0140 </td><td>-0.803 </td><td>19 </td><td>160 </td><td>53.9 </td><td>-0.0142</td><td>-1.210 </td><td>19 </td><td>3.77 </td><td>0.777 </td><td>0.976 </td><td> 0.142 </td></tr>\n\t<tr><td>1 </td><td>13 </td><td>24.4 </td><td>6.17 </td><td>0.0526 </td><td>-1.455 </td><td>13 </td><td>127 </td><td>84.1 </td><td> 1.3599</td><td> 0.563 </td><td>13 </td><td>2.41 </td><td>0.617 </td><td>0.210 </td><td>-1.174 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev\n1 0 19 17.1 3.83 0.0140 -0.803 19 160 53.9 \n2 1 13 24.4 6.17 0.0526 -1.455 13 127 84.1 \n hp.skew hp.kurtosis wt.n wt.mean wt.stdev wt.skew wt.kurtosis\n1 -0.0142 -1.210 19 3.77 0.777 0.976 0.142 \n2 1.3599 0.563 13 2.41 0.617 0.210 -1.174 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#summary statistics by group using the summaryBy() function in the doBy package\ninstall.packages(\"doBy\") #install doBy\nlibrary(doBy) #load the doBy package\nsummaryBy(mpg + hp + wt ~ am, data = mtcars, FUN = mystats)"
},
{
"execution_count": 14,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "$`0`\n vars n mean sd median trimmed mad min max range skew\nmpg 1 19 17.15 3.83 17.30 17.12 3.11 10.40 24.40 14.00 0.01\nhp 2 19 160.26 53.91 175.00 161.06 77.10 62.00 245.00 183.00 -0.01\nwt 3 19 3.77 0.78 3.52 3.75 0.45 2.46 5.42 2.96 0.98\n kurtosis se\nmpg -0.80 0.88\nhp -1.21 12.37\nwt 0.14 0.18\n\n$`1`\n vars n mean sd median trimmed mad min max range skew kurtosis\nmpg 1 13 24.39 6.17 22.80 24.38 6.67 15.00 33.90 18.90 0.05 -1.46\nhp 2 13 126.85 84.06 109.00 114.73 63.75 52.00 335.00 283.00 1.36 0.56\nwt 3 13 2.41 0.62 2.32 2.39 0.68 1.51 3.57 2.06 0.21 -1.17\n se\nmpg 1.71\nhp 23.31\nwt 0.17\n\nattr(,\"call\")\nby.data.frame(data = x, INDICES = group, FUN = describe, type = type)"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#summary statistics by group using the describeBy() function in the psych package\nlibrary(psych) #load the psych package\ndescribeBy(mtcars[myvars], list(am = mtcars$am))"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Frequency and contingency tables"
},
{
"execution_count": 15,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Loading required package: grid\n\nAttaching package: \u2018grid\u2019\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n explode\n\n"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lllll}\n ID & Treatment & Sex & Age & Improved\\\\\n\\hline\n\t 57 & Treated & Male & 27 & Some \\\\\n\t 46 & Treated & Male & 29 & None \\\\\n\t 77 & Treated & Male & 30 & None \\\\\n\t 17 & Treated & Male & 32 & Marked \\\\\n\t 36 & Treated & Male & 46 & Marked \\\\\n\t 23 & Treated & Male & 58 & Marked \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th scope=col>ID</th><th scope=col>Treatment</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>Improved</th></tr></thead>\n<tbody>\n\t<tr><td>57 </td><td>Treated</td><td>Male </td><td>27 </td><td>Some </td></tr>\n\t<tr><td>46 </td><td>Treated</td><td>Male </td><td>29 </td><td>None </td></tr>\n\t<tr><td>77 </td><td>Treated</td><td>Male </td><td>30 </td><td>None </td></tr>\n\t<tr><td>17 </td><td>Treated</td><td>Male </td><td>32 </td><td>Marked </td></tr>\n\t<tr><td>36 </td><td>Treated</td><td>Male </td><td>46 </td><td>Marked </td></tr>\n\t<tr><td>23 </td><td>Treated</td><td>Male </td><td>58 </td><td>Marked </td></tr>\n</tbody>\n</table>\n",
"text/plain": " ID Treatment Sex Age Improved\n1 57 Treated Male 27 Some \n2 46 Treated Male 29 None \n3 77 Treated Male 30 None \n4 17 Treated Male 32 Marked \n5 36 Treated Male 46 Marked \n6 23 Treated Male 58 Marked "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#in this section we will look at frequency and contingency tables from categorical variables,\n#along with tests of independence, measures of association, and methods for graphically displaying results\n#we will use the Arthritis dataset from the vcd package\nlibrary(vcd) #load the vcd package\nhead(Arthritis) #display the first 6 rows"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Generating frequency tables"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### One way tables"
},
{
"execution_count": 16,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Improved\n None Some Marked \n 42 14 28 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "mytable <- with(Arthritis, table(Improved))\nmytable"
},
{
"execution_count": 17,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Improved\n None Some Marked \n 0.500 0.167 0.333 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#convert frequencies into proportions using prop.table():\nprop.table(mytable)"
},
{
"execution_count": 18,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Improved\n None Some Marked \n 50.0 16.7 33.3 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#convert frequencies into percentages using prop.table() * 100:\nprop.table(mytable) * 100"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Two way tables"
},
{
"execution_count": 19,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 29 7 7\n Treated 13 7 21"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "mytable <- xtabs(~ Treatment + Improved, data = Arthritis)\nmytable"
},
{
"execution_count": 20,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#you can generate marginal frequencies and proportions using the\n#margin.table() and prop.table() functions"
},
{
"execution_count": 21,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Treatment\nPlacebo Treated \n 43 41 "
},
"output_type": "display_data"
},
{
"metadata": {},
"data": {
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 0.674 0.163 0.163\n Treated 0.317 0.171 0.512"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#for row sums and row proportions, you have:\nmargin.table(mytable, 1)\nprop.table(mytable, 1)"
},
{
"execution_count": 22,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Improved\n None Some Marked \n 42 14 28 "
},
"output_type": "display_data"
},
{
"metadata": {},
"data": {
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 0.69 0.50 0.25\n Treated 0.31 0.50 0.75"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#for column sums and column proportions, you have:\nmargin.table(mytable, 2)\nprop.table(mytable, 2)"
},
{
"execution_count": 23,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 0.3452 0.0833 0.0833\n Treated 0.1548 0.0833 0.2500"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#cell proportions are obtained by:\nprop.table(mytable)"
},
{
"execution_count": 24,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llll}\n & None & Some & Marked & Sum\\\\\n\\hline\n\tPlacebo & 29 & 7 & 7 & 43\\\\\n\tTreated & 13 & 7 & 21 & 41\\\\\n\tSum & 42 & 14 & 28 & 84\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>None</th><th scope=col>Some</th><th scope=col>Marked</th><th scope=col>Sum</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Placebo</th><td>29</td><td> 7</td><td> 7</td><td>43</td></tr>\n\t<tr><th scope=row>Treated</th><td>13</td><td> 7</td><td>21</td><td>41</td></tr>\n\t<tr><th scope=row>Sum</th><td>42</td><td>14</td><td>28</td><td>84</td></tr>\n</tbody>\n</table>\n",
"text/plain": " Improved\nTreatment None Some Marked Sum\n Placebo 29 7 7 43 \n Treated 13 7 21 41 \n Sum 42 14 28 84 "
},
"output_type": "display_data"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llll}\n & None & Some & Marked & Sum\\\\\n\\hline\n\tPlacebo & 0.345 & 0.0833 & 0.0833 & 0.512 \\\\\n\tTreated & 0.155 & 0.0833 & 0.2500 & 0.488 \\\\\n\tSum & 0.500 & 0.1667 & 0.3333 & 1.000 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>None</th><th scope=col>Some</th><th scope=col>Marked</th><th scope=col>Sum</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Placebo</th><td>0.345 </td><td>0.0833</td><td>0.0833</td><td>0.512 </td></tr>\n\t<tr><th scope=row>Treated</th><td>0.155 </td><td>0.0833</td><td>0.2500</td><td>0.488 </td></tr>\n\t<tr><th scope=row>Sum</th><td>0.500 </td><td>0.1667</td><td>0.3333</td><td>1.000 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " Improved\nTreatment None Some Marked Sum \n Placebo 0.345 0.0833 0.0833 0.512\n Treated 0.155 0.0833 0.2500 0.488\n Sum 0.500 0.1667 0.3333 1.000"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#you can use addmargins() to add marginal sums to these tables\naddmargins(mytable)\naddmargins(prop.table(mytable))"
},
{
"execution_count": 25,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llll}\n & None & Some & Marked & Sum\\\\\n\\hline\n\tPlacebo & 0.674 & 0.163 & 0.163 & 1 \\\\\n\tTreated & 0.317 & 0.171 & 0.512 & 1 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>None</th><th scope=col>Some</th><th scope=col>Marked</th><th scope=col>Sum</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Placebo</th><td>0.674</td><td>0.163</td><td>0.163</td><td>1 </td></tr>\n\t<tr><th scope=row>Treated</th><td>0.317</td><td>0.171</td><td>0.512</td><td>1 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " Improved\nTreatment None Some Marked Sum\n Placebo 0.674 0.163 0.163 1 \n Treated 0.317 0.171 0.512 1 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if you only want to add a sum column:\naddmargins(prop.table(mytable, 1), 2)"
},
{
"execution_count": 26,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|lll}\n & None & Some & Marked\\\\\n\\hline\n\tPlacebo & 0.69 & 0.5 & 0.25\\\\\n\tTreated & 0.31 & 0.5 & 0.75\\\\\n\tSum & 1.00 & 1.0 & 1.00\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>None</th><th scope=col>Some</th><th scope=col>Marked</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Placebo</th><td>0.69</td><td>0.5 </td><td>0.25</td></tr>\n\t<tr><th scope=row>Treated</th><td>0.31</td><td>0.5 </td><td>0.75</td></tr>\n\t<tr><th scope=row>Sum</th><td>1.00</td><td>1.0 </td><td>1.00</td></tr>\n</tbody>\n</table>\n",
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 0.69 0.5 0.25 \n Treated 0.31 0.5 0.75 \n Sum 1.00 1.0 1.00 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if you only want to add a sum row:\naddmargins(prop.table(mytable, 2), 1)"
},
{
"execution_count": 27,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\n"
},
{
"output_type": "stream",
"name": "stdout",
"text": "\n \n Cell Contents\n|-------------------------|\n| N |\n| Chi-square contribution |\n| N / Row Total |\n| N / Col Total |\n| N / Table Total |\n|-------------------------|\n\n \nTotal Observations in Table: 84 \n\n \n | Arthritis$Improved \nArthritis$Treatment | None | Some | Marked | Row Total | \n--------------------|-----------|-----------|-----------|-----------|\n Placebo | 29 | 7 | 7 | 43 | \n | 2.616 | 0.004 | 3.752 | | \n | 0.674 | 0.163 | 0.163 | 0.512 | \n | 0.690 | 0.500 | 0.250 | | \n | 0.345 | 0.083 | 0.083 | | \n--------------------|-----------|-----------|-----------|-----------|\n Treated | 13 | 7 | 21 | 41 | \n | 2.744 | 0.004 | 3.935 | | \n | 0.317 | 0.171 | 0.512 | 0.488 | \n | 0.310 | 0.500 | 0.750 | | \n | 0.155 | 0.083 | 0.250 | | \n--------------------|-----------|-----------|-----------|-----------|\n Column Total | 42 | 14 | 28 | 84 | \n | 0.500 | 0.167 | 0.333 | | \n--------------------|-----------|-----------|-----------|-----------|\n\n \n"
}
],
"metadata": {},
"source": "#a third method of creating two-way tables is the CrossTable() function in the \"gmodels\" package:\ninstall.packages(\"gmodels\")\nlibrary(gmodels)\nCrossTable(Arthritis$Treatment, Arthritis$Improved)"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Multidimensional tables"
},
{
"execution_count": 28,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": ", , Improved = None\n\n Sex\nTreatment Female Male\n Placebo 19 10\n Treated 6 7\n\n, , Improved = Some\n\n Sex\nTreatment Female Male\n Placebo 7 0\n Treated 5 2\n\n, , Improved = Marked\n\n Sex\nTreatment Female Male\n Placebo 6 1\n Treated 16 5\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if you have more than two categorical variables, you're dealing with multidimensional tables\n#here's an example of a three-way contingency table\nmytable <- xtabs(~ Treatment + Sex + Improved, data = Arthritis) #cell frequencies\nmytable"
},
{
"execution_count": 29,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved None Some Marked\nTreatment Sex \nPlacebo Female 19 7 6\n Male 10 0 1\nTreated Female 6 5 16\n Male 7 2 5"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "ftable(mytable) #presents multidimensional tables in a compact and attractive manner"
},
{
"execution_count": 30,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Treatment\nPlacebo Treated \n 43 41 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "margin.table(mytable, 1) #marginal frequencies for the first dimension (Treatment)"
},
{
"execution_count": 31,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Sex\nFemale Male \n 59 25 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "margin.table(mytable, 2) #marginal frequencies for the second dimension (Sex)"
},
{
"execution_count": 32,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Improved\n None Some Marked \n 42 14 28 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "margin.table(mytable, 3) #marginal frequencies by column (Improved)"
},
{
"execution_count": 33,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved\nTreatment None Some Marked\n Placebo 29 7 7\n Treated 13 7 21"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "margin.table(mytable, c(1, 3)) #treatment X improved marginal frequencies"
},
{
"execution_count": 34,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved None Some Marked\nTreatment Sex \nPlacebo Female 0.5938 0.2188 0.1875\n Male 0.9091 0.0000 0.0909\nTreated Female 0.2222 0.1852 0.5926\n Male 0.5000 0.1429 0.3571"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "ftable(prop.table(mytable, c(1, 2))) #improved proportions for treatment X sex"
},
{
"execution_count": 35,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved None Some Marked Sum\nTreatment Sex \nPlacebo Female 0.5938 0.2188 0.1875 1.0000\n Male 0.9091 0.0000 0.0909 1.0000\nTreated Female 0.2222 0.1852 0.5926 1.0000\n Male 0.5000 0.1429 0.3571 1.0000"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "ftable(addmargins(prop.table(mytable, c(1, 2)), 3)) #improved proportions for treatment X sex with row totals"
},
{
"execution_count": 36,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " Improved None Some Marked Sum\nTreatment Sex \nPlacebo Female 59.38 21.88 18.75 100.00\n Male 90.91 0.00 9.09 100.00\nTreated Female 22.22 18.52 59.26 100.00\n Male 50.00 14.29 35.71 100.00"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "ftable(addmargins(prop.table(mytable, c(1, 2)), 3)) * 100 #improved percentages for treatment X sex with row totals"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Tests of independence"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Chi-Square test of independence"
},
{
"execution_count": 37,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tPearson's Chi-squared test\n\ndata: mytable\nX-squared = 10, df = 2, p-value = 0.001\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#you can apply the function chisq.test() to a two-way table to produce a chi-square test of independence\n#of the row and column variables. example:\nlibrary(vcd) #load the vcd package for the Arthritis dataset\nmytable <- xtabs(~ Treatment + Improved, data = Arthritis) #create a two-way table\nchisq.test(mytable) #run the chi-square test"
},
{
"execution_count": 38,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#let us understand the result:\n#let us begin with a hypothesis: treatment type and outcome are independent\n#the p-value is the probability of obtaining the sampled results, assuming independence of\n #the row and column variables in the population.\n#because p-value < 0.01, you reject the hypothesis that the treatment type is independent from the outcome!"
},
{
"execution_count": 39,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Warning message in chisq.test(mytable):\n\u201cChi-squared approximation may be incorrect\u201d"
},
{
"metadata": {},
"data": {
"text/plain": "\n\tPearson's Chi-squared test\n\ndata: mytable\nX-squared = 5, df = 2, p-value = 0.09\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#another example:\nmytable <- xtabs(~ Improved + Sex, data = Arthritis) #create a two-way table\nchisq.test(mytable) #run the chi-square test"
},
{
"execution_count": 40,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#let us understand the result:\n#let us begin with a hypothesis: outcome and sex are independent\n#the p-value is the probability of obtaining the sampled results, assuming independence of\n #the row and column variables in the population.\n#because p-value > 0.05, you confirm the hypothesis that outcome is independent from sex!"
},
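{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#a hedged aside (not part of the original examples): the warning above (\"Chi-squared approximation may be incorrect\")\n#usually means some expected cell counts are small (a common rule of thumb: expected counts < 5);\n#you can inspect the expected frequencies stored in the test object\nmytable <- xtabs(~ Improved + Sex, data = Arthritis) #re-create the two-way table\nchisq.test(mytable)$expected #expected frequencies under the independence hypothesis"
},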
{
"execution_count": 41,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#conclusion:\n#in a chi-square test of independence:\n#the hypothesis is that two variables are independent\n#if the p-value < 0.01, you reject the hypothesis\n#if the p-value > 0.05, you confirm the hypothesis"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Fisher's exact test"
},
{
"execution_count": 42,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tFisher's Exact Test for Count Data\n\ndata: mytable\np-value = 0.001\nalternative hypothesis: two.sided\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#Fisher's exact test evaluates the null hypothesis of independence of rows and columns\n#in a contingency table with fixed marginals. The format is fisher.test(mytable) where\n#mytable is a two-way table. In contrast to many statistical packages, the fisher.test()\n#function can be applied to any two-way table with two or more rows and columns, not\n#just a 2X2 table. here's an example of a Fisher's exact test:\nlibrary(vcd) #load the vcd package for the Arthritis dataset\nmytable <- xtabs(~ Treatment + Improved, data = Arthritis) #create the 2-way table\nfisher.test(mytable) #run the test"
},
{
"execution_count": 46,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#because p-value is < 0.01, we reject the null hypothesis that Treatment and Improved are independent of each other"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Cochran-Mantel-Haenszel test"
},
{
"execution_count": 44,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tCochran-Mantel-Haenszel test\n\ndata: mytable\nCochran-Mantel-Haenszel M^2 = 10, df = 2, p-value = 7e-04\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#the mantelhaen.test() provides a Cochran-Mantel-Haenszel chi-square test of the null hypothesis\n#that two nominal variables are conditionally independent in each stratum of a third variable.\n#the following code tests the hypothesis that the Treatment and Improved variables are independent\n#within each level for Sex. The test assumes that there is no three-way (Treatment X Improved X Sex) interaction:\nlibrary(vcd) #load the vcd package for the Arthritis dataset\nmytable <- xtabs(~ Treatment + Improved + Sex, data = Arthritis) #create the 3-way table\nmantelhaen.test(mytable) #run the test"
},
{
"execution_count": 45,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#7e-04 = 7 * 10^-4 = 0.0007\n#because p-value is < 0.01, we reject the null hypothesis that Treatment and Improved are independent of each other when controlled for Sex"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Measures of association"
},
{
"execution_count": 48,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": " X^2 df P(> X^2)\nLikelihood Ratio 13.530 2 0.0011536\nPearson 13.055 2 0.0014626\n\nPhi-Coefficient : NA \nContingency Coeff.: 0.367 \nCramer's V : 0.394 "
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if you reject the null hypothesis of independence of variables using either the chi-square or the Fisher's exact of the CMH test,\n#you try to guage the strength of the dependence or association.\n#the assocstats() function of the vcd package calculates the phi-coefficient, contingency coefficient and Cramer's V for a 2-way table\n#in general, LOWER magnitudes indicate STRONGER associations. here's an example:\nlibrary(vcd) #load the vcd package for the Arthritis dataset\nmytable <- xtabs(~ Treatment + Improved, data = Arthritis) #create the 2-way table\nassocstats(mytable) #run the test"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Correlations"
},
{
"execution_count": 49,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#Correlation coefficients are used to describe relationships among quantitative variables\n#the sign +- indicates the direction of the relationship (positive or negative)\n#the magnitude indicates strength of the relationship (ranging from 0 for no relationship to 1 for a perfectly predictable relationship)"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Types of correlations"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Pearson, Spearman and Kendall correlations"
},
{
"execution_count": 51,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the Pearson product-moment correlation assesses the degree of linear relationship between 2 quantitative variables\n#Spearman's rank-oder correlation coefficient assesses the degree of relationship between 2 rank-ordered variables\n#Kendall's tau is also a nonparametric measure of rank correlation"
},
{
"execution_count": 52,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the cor() function produces all 3 correlation coefficients whereas the cov() function produces covariances\n#the simplified format is: cor(x, use = , method = );\n#where x = matrix or data frame; use = specifies the handling of missing data (default = everything); method = method (default = \"pearson\")\n#examples follow"
},
{
"execution_count": 53,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "states <- state.x77[, 1:6] #choose a subset of the state.x77 dataset; pick columns 1 through 6 and store them in the \"states\" object"
},
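{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#a brief hedged sketch: cov() shares cor()'s interface; it returns variances on the diagonal\n#and covariances off the diagonal (stats:: is spelled out in case an attached package masks cov())\nstats::cov(states)"
},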
{
"execution_count": 57,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llllll}\n & Population & Income & Illiteracy & Life Exp & Murder & HS Grad\\\\\n\\hline\n\tPopulation & 1.0000 & 0.208 & 0.108 & -0.0681 & 0.344 & -0.0985\\\\\n\tIncome & 0.2082 & 1.000 & -0.437 & 0.3403 & -0.230 & 0.6199\\\\\n\tIlliteracy & 0.1076 & -0.437 & 1.000 & -0.5885 & 0.703 & -0.6572\\\\\n\tLife Exp & -0.0681 & 0.340 & -0.588 & 1.0000 & -0.781 & 0.5822\\\\\n\tMurder & 0.3436 & -0.230 & 0.703 & -0.7808 & 1.000 & -0.4880\\\\\n\tHS Grad & -0.0985 & 0.620 & -0.657 & 0.5822 & -0.488 & 1.0000\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>Population</th><th scope=col>Income</th><th scope=col>Illiteracy</th><th scope=col>Life Exp</th><th scope=col>Murder</th><th scope=col>HS Grad</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Population</th><td> 1.0000</td><td> 0.208 </td><td> 0.108 </td><td>-0.0681</td><td> 0.344 </td><td>-0.0985</td></tr>\n\t<tr><th scope=row>Income</th><td> 0.2082</td><td> 1.000 </td><td>-0.437 </td><td> 0.3403</td><td>-0.230 </td><td> 0.6199</td></tr>\n\t<tr><th scope=row>Illiteracy</th><td> 0.1076</td><td>-0.437 </td><td> 1.000 </td><td>-0.5885</td><td> 0.703 </td><td>-0.6572</td></tr>\n\t<tr><th scope=row>Life Exp</th><td>-0.0681</td><td> 0.340 </td><td>-0.588 </td><td> 1.0000</td><td>-0.781 </td><td> 0.5822</td></tr>\n\t<tr><th scope=row>Murder</th><td> 0.3436</td><td>-0.230 </td><td> 0.703 </td><td>-0.7808</td><td> 1.000 </td><td>-0.4880</td></tr>\n\t<tr><th scope=row>HS Grad</th><td>-0.0985</td><td> 0.620 </td><td>-0.657 </td><td> 0.5822</td><td>-0.488 </td><td> 1.0000</td></tr>\n</tbody>\n</table>\n",
"text/plain": " Population Income Illiteracy Life Exp Murder HS Grad\nPopulation 1.0000 0.208 0.108 -0.0681 0.344 -0.0985\nIncome 0.2082 1.000 -0.437 0.3403 -0.230 0.6199\nIlliteracy 0.1076 -0.437 1.000 -0.5885 0.703 -0.6572\nLife Exp -0.0681 0.340 -0.588 1.0000 -0.781 0.5822\nMurder 0.3436 -0.230 0.703 -0.7808 1.000 -0.4880\nHS Grad -0.0985 0.620 -0.657 0.5822 -0.488 1.0000",
"text/markdown": "1. 1\n2. 0.208227557479216\n3. 0.107622373394733\n4. -0.0680519520843897\n5. 0.343642750812065\n6. -0.0984897481740015\n7. 0.208227557479216\n8. 1\n9. -0.437075185597417\n10. 0.340255338936367\n11. -0.230077610259372\n12. 0.619932323134002\n13. 0.107622373394733\n14. -0.437075185597417\n15. 1\n16. -0.588477925579257\n17. 0.70297519868417\n18. -0.657188609438515\n19. -0.0680519520843897\n20. 0.340255338936367\n21. -0.588477925579257\n22. 1\n23. -0.780845752227159\n24. 0.582216203903882\n25. 0.343642750812065\n26. -0.230077610259372\n27. 0.70297519868417\n28. -0.780845752227159\n29. 1\n30. -0.487971022334675\n31. -0.0984897481740015\n32. 0.619932323134002\n33. -0.657188609438515\n34. 0.582216203903882\n35. -0.487971022334675\n36. 1\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "cor(states) #uses defaults for use (everything) and method (pearson)"
},
{
"execution_count": 58,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|llllll}\n & Population & Income & Illiteracy & Life Exp & Murder & HS Grad\\\\\n\\hline\n\tPopulation & 1.000 & 0.125 & 0.313 & -0.104 & 0.346 & -0.383\\\\\n\tIncome & 0.125 & 1.000 & -0.315 & 0.324 & -0.217 & 0.510\\\\\n\tIlliteracy & 0.313 & -0.315 & 1.000 & -0.555 & 0.672 & -0.655\\\\\n\tLife Exp & -0.104 & 0.324 & -0.555 & 1.000 & -0.780 & 0.524\\\\\n\tMurder & 0.346 & -0.217 & 0.672 & -0.780 & 1.000 & -0.437\\\\\n\tHS Grad & -0.383 & 0.510 & -0.655 & 0.524 & -0.437 & 1.000\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>Population</th><th scope=col>Income</th><th scope=col>Illiteracy</th><th scope=col>Life Exp</th><th scope=col>Murder</th><th scope=col>HS Grad</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Population</th><td> 1.000</td><td> 0.125</td><td> 0.313</td><td>-0.104</td><td> 0.346</td><td>-0.383</td></tr>\n\t<tr><th scope=row>Income</th><td> 0.125</td><td> 1.000</td><td>-0.315</td><td> 0.324</td><td>-0.217</td><td> 0.510</td></tr>\n\t<tr><th scope=row>Illiteracy</th><td> 0.313</td><td>-0.315</td><td> 1.000</td><td>-0.555</td><td> 0.672</td><td>-0.655</td></tr>\n\t<tr><th scope=row>Life Exp</th><td>-0.104</td><td> 0.324</td><td>-0.555</td><td> 1.000</td><td>-0.780</td><td> 0.524</td></tr>\n\t<tr><th scope=row>Murder</th><td> 0.346</td><td>-0.217</td><td> 0.672</td><td>-0.780</td><td> 1.000</td><td>-0.437</td></tr>\n\t<tr><th scope=row>HS Grad</th><td>-0.383</td><td> 0.510</td><td>-0.655</td><td> 0.524</td><td>-0.437</td><td> 1.000</td></tr>\n</tbody>\n</table>\n",
"text/plain": " Population Income Illiteracy Life Exp Murder HS Grad\nPopulation 1.000 0.125 0.313 -0.104 0.346 -0.383 \nIncome 0.125 1.000 -0.315 0.324 -0.217 0.510 \nIlliteracy 0.313 -0.315 1.000 -0.555 0.672 -0.655 \nLife Exp -0.104 0.324 -0.555 1.000 -0.780 0.524 \nMurder 0.346 -0.217 0.672 -0.780 1.000 -0.437 \nHS Grad -0.383 0.510 -0.655 0.524 -0.437 1.000 ",
"text/markdown": "1. 1\n2. 0.124609843937575\n3. 0.313049606602816\n4. -0.104017096339212\n5. 0.345740086193474\n6. -0.383364949250197\n7. 0.124609843937575\n8. 1\n9. -0.314594815211684\n10. 0.324104978390277\n11. -0.217462301748027\n12. 0.510480948331404\n13. 0.313049606602816\n14. -0.314594815211684\n15. 1\n16. -0.555373492029757\n17. 0.672359185839312\n18. -0.654539568798863\n19. -0.104017096339212\n20. 0.324104978390277\n21. -0.555373492029757\n22. 1\n23. -0.780240630251865\n24. 0.523941023917011\n25. 0.345740086193474\n26. -0.217462301748027\n27. 0.672359185839312\n28. -0.780240630251865\n29. 1\n30. -0.436733028678376\n31. -0.383364949250197\n32. 0.510480948331404\n33. -0.654539568798863\n34. 0.523941023917011\n35. -0.436733028678376\n36. 1\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "cor(states, method = \"spearman\") #uses defaults for use (everything)"
},
{
"execution_count": 59,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|ll}\n & Life Exp & Murder\\\\\n\\hline\n\tPopulation & -0.0681 & 0.344 \\\\\n\tIncome & 0.3403 & -0.230 \\\\\n\tIlliteracy & -0.5885 & 0.703 \\\\\n\tHS Grad & 0.5822 & -0.488 \\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>Life Exp</th><th scope=col>Murder</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>Population</th><td>-0.0681</td><td> 0.344 </td></tr>\n\t<tr><th scope=row>Income</th><td> 0.3403</td><td>-0.230 </td></tr>\n\t<tr><th scope=row>Illiteracy</th><td>-0.5885</td><td> 0.703 </td></tr>\n\t<tr><th scope=row>HS Grad</th><td> 0.5822</td><td>-0.488 </td></tr>\n</tbody>\n</table>\n",
"text/plain": " Life Exp Murder\nPopulation -0.0681 0.344\nIncome 0.3403 -0.230\nIlliteracy -0.5885 0.703\nHS Grad 0.5822 -0.488",
"text/markdown": "1. -0.0680519520843897\n2. 0.340255338936367\n3. -0.588477925579257\n4. 0.582216203903882\n5. 0.343642750812065\n6. -0.230077610259372\n7. 0.70297519868417\n8. -0.487971022334675\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#you get square matrices by default (i.e. all variables crossed with all other variables)\n#you can also produce non-square variables as follows:\nx <- states[, c(\"Population\", \"Income\", \"Illiteracy\", \"HS Grad\")]\ny <- states[, c(\"Life Exp\", \"Murder\")]\ncor(x, y)"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Partial correlations"
},
{
"execution_count": 62,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Installing package into \u2018/gpfs/global_fs01/sym_shared/YPProdSpark/user/s17c-9f3318fc11f06c-d37a4b9405b6/R/libs\u2019\n(as \u2018lib\u2019 is unspecified)\nalso installing the dependencies \u2018irlba\u2019, \u2018igraph\u2019\n\nLoading required package: igraph\n\nAttaching package: \u2018igraph\u2019\n\nThe following object is masked from \u2018package:SparkR\u2019:\n\n union\n\nThe following objects are masked from \u2018package:stats\u2019:\n\n decompose, spectrum\n\nThe following object is masked from \u2018package:base\u2019:\n\n union\n\n\nAttaching package: \u2018ggm\u2019\n\nThe following object is masked from \u2018package:igraph\u2019:\n\n pa\n\nThe following object is masked from \u2018package:Hmisc\u2019:\n\n rcorr\n\n"
},
{
"metadata": {},
"data": {
"text/latex": "\\begin{enumerate*}\n\\item 'Population'\n\\item 'Income'\n\\item 'Illiteracy'\n\\item 'Life Exp'\n\\item 'Murder'\n\\item 'HS Grad'\n\\end{enumerate*}\n",
"text/html": "<ol class=list-inline>\n\t<li>'Population'</li>\n\t<li>'Income'</li>\n\t<li>'Illiteracy'</li>\n\t<li>'Life Exp'</li>\n\t<li>'Murder'</li>\n\t<li>'HS Grad'</li>\n</ol>\n",
"text/plain": "[1] \"Population\" \"Income\" \"Illiteracy\" \"Life Exp\" \"Murder\" \n[6] \"HS Grad\" ",
"text/markdown": "1. 'Population'\n2. 'Income'\n3. 'Illiteracy'\n4. 'Life Exp'\n5. 'Murder'\n6. 'HS Grad'\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#a partial correlation is a correlation between 2 quantitative variables, controlling for one or more quantitative variables\n#the pcor() function in the ggm package provides partial correlation coefficients. here's an example:\ninstall.packages(\"ggm\") #install the ggm package\nlibrary(ggm) #load the ggm package\ncolnames(states) #names of the columns of the states.x77 dataset in R"
},
{
"execution_count": 69,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "0.346272371372947",
"text/html": "0.346272371372947",
"text/plain": "[1] 0.346",
"text/markdown": "0.346272371372947"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#the form of the pcor() function is as follows:\n#pcor(u, S) where u is a vector of numbers, with the first two numbers being the indices of the numbers to be correlated\n#and the remaining numbers being the indices of the conditioning variables\n#S is the covariance matrix among the variables\npcor(c(1, 5, 2, 3, 6), stats::cov(states)) #here, 1 = Population and 5 = Murder are the 2 variables to be correlated\n #Income, Illiteracy and HS Grad are the conditioning variables\n #cov(states) is the covariance matrix among variables\n #to get the cov() function to work, we specify the package name by using stats::cov(dataset)"
},
{
"execution_count": 71,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#in this case 0.346 is the correlation between population and murder rate controlling for:\n#income, illiteracy and hs graduation rate"
},
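{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#a hedged sketch (not run in the original notebook): the ggm package also provides pcor.test(r, q, n)\n#to test a partial correlation for significance, where r is the value returned by pcor(),\n#q is the number of conditioning variables and n is the sample size\npc <- pcor(c(1, 5, 2, 3, 6), stats::cov(states)) #partial correlation of Population and Murder\npcor.test(pc, 3, 50) #q = 3 conditioning variables; n = 50 states; returns the t statistic, df and p-value"
},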
{
"metadata": {
"collapsed": true
},
"cell_type": "markdown",
"source": "#### Other types of correlations"
},
{
"execution_count": 65,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the hatcor() function in the polycor package can compute a heterogeneous correlation matrix containing\n#Pearson product-moment correlations between numeric variables,\n#polyserial correlations between numeric and ordinal variables,\n#polychoric correlations between ordinal variables, and\n#tetrachoric correlations between two dichotomous variables.\n#Polyserial, polychoric and tetrachoric correlations assume that the ordinal or dichotomous variables\n#are derived from underlying normal distributions."
},
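{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#a hedged sketch, assuming the polycor package is available (it is not installed elsewhere in this notebook)\n#hetcor() infers each variable's type and returns the appropriate mix of correlations\ninstall.packages(\"polycor\") #install the polycor package\nlibrary(polycor) #load the polycor package\nmixed <- data.frame(x = rnorm(100), y = cut(rnorm(100), 3, ordered_result = TRUE)) #made-up numeric + ordinal data\nhetcor(mixed) #here the off-diagonal entry is a polyserial correlation (numeric with ordinal)"
},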
{
"metadata": {},
"cell_type": "markdown",
"source": "### Testing correlations for significance"
},
{
"execution_count": 72,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tPearson's product-moment correlation\n\ndata: states[, 3] and states[, 5]\nt = 7, df = 50, p-value = 1e-08\nalternative hypothesis: true correlation is not equal to 0\n95 percent confidence interval:\n 0.528 0.821\nsample estimates:\n cor \n0.703 \n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#you can use the cor.test() function to test an individual correlation coefficient\n#the format is: cor.test(x, y, alternative = , method = ), where\n#x and y are the variables to be correlated\n#alternative could be \"two.side\" (default; population correlation isn't equal to 0); \"less\" (population correlation < 0); or\n #\"greater\" (population correlation > 0)\n#method could be either \"pearson\" (default), \"kendall\" or \"spearman\"\n#here's an example:\ncor.test(states[, 3], states[, 5]) #testing the correlation between illiteracy and murder rate\n #the defaults are: alternative = \"two.side\" and method = \"pearson\""
},
{
"execution_count": 74,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the code above tests that the Pearson correlation between illiteracy and murder rate is 0.\n#if the population correlation between these variables was in fact 0, then you would expect to see\n#a sample correlation as large as 0.703 less than 1 time out of 10 million (i.e. p-value = 1e-08 or 1 / 10 ^ 8)\n#since this is unlikely, you reject the null hypothesis that the correlation between illiteracy and murder rate is 0,\n#and accept the alternative hypothesis that the correlation between illiteracy and murder rate is NOT 0."
},
{
"execution_count": 73,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "Call:corr.test(x = states, use = \"complete\")\nCorrelation matrix \n Population Income Illiteracy Life Exp Murder HS Grad\nPopulation 1.00 0.21 0.11 -0.07 0.34 -0.10\nIncome 0.21 1.00 -0.44 0.34 -0.23 0.62\nIlliteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66\nLife Exp -0.07 0.34 -0.59 1.00 -0.78 0.58\nMurder 0.34 -0.23 0.70 -0.78 1.00 -0.49\nHS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00\nSample Size \n[1] 50\nProbability values (Entries above the diagonal are adjusted for multiple tests.) \n Population Income Illiteracy Life Exp Murder HS Grad\nPopulation 0.00 0.59 1.00 1.0 0.10 1\nIncome 0.15 0.00 0.01 0.1 0.54 0\nIlliteracy 0.46 0.00 0.00 0.0 0.00 0\nLife Exp 0.64 0.02 0.00 0.0 0.00 0\nMurder 0.01 0.11 0.00 0.0 0.00 0\nHS Grad 0.50 0.00 0.00 0.0 0.00 0\n\n To see confidence intervals of the correlations, print with the short=FALSE option"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#using cor.test(), you can only test one correlation at a time\n#the corr.test() function in the \"psych\" package allows you to go further. here's an example:\nlibrary(psych) #load the psych library\ncorr.test(states, use = \"complete\") #\"complete\" = listwise deletion of missing values"
},
{
"execution_count": 75,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#in the above code, you can see that the correlation between population size and HS grad rate (-0.10)\n#is not significantly different from 0 (p = 0.50)."
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## T-tests"
},
{
"execution_count": 76,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#t-tests help in the comparison of two groups"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Independent t-test"
},
{
"execution_count": 78,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tWelch Two Sample t-test\n\ndata: Prob by So\nt = -4, df = 20, p-value = 7e-04\nalternative hypothesis: true difference in means is not equal to 0\n95 percent confidence interval:\n -0.0385 -0.0119\nsample estimates:\nmean in group 0 mean in group 1 \n 0.0385 0.0637 \n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#are you more likely to be imprisoned if you commit a crime in the South of the US?\n#the comparison of interest is Southern versus non-Southers states,\n#and the dependent variable is the probablity of incarceration.\n#a two-group independent t-test can be used to test the hypothesis that the two population means are equal\n#we will use the UScrime dataset from the MASS package to test this hypothesis\nlibrary(MASS) #attach the MASS package\nt.test(Prob ~ So, data = UScrime) #Prob = probability of incarceration; So = indicator variable for southern states"
},
{
"execution_count": 79,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#from the results above, you can reject the null hypothesis that Southern states and non-Southern states,\n#have equal probabilities of imprisonment (p < 0.001)"
},
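{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#a hedged note: t.test() defaults to the Welch test, which does not assume equal group variances\n#(hence the \"Welch Two Sample t-test\" heading in the output above);\n#if you are willing to assume equal variances, add var.equal = TRUE for the classic pooled-variance t-test\nt.test(Prob ~ So, data = UScrime, var.equal = TRUE)"
},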
{
"metadata": {},
"cell_type": "markdown",
"source": "### Dependent t-test"
},
{
"execution_count": 80,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{tabular}{r|ll}\n & U1 & U2\\\\\n\\hline\n\tmean & 95.5 & 33.98\\\\\n\tsd & 18.0 & 8.45\\\\\n\\end{tabular}\n",
"text/html": "<table>\n<thead><tr><th></th><th scope=col>U1</th><th scope=col>U2</th></tr></thead>\n<tbody>\n\t<tr><th scope=row>mean</th><td>95.5 </td><td>33.98</td></tr>\n\t<tr><th scope=row>sd</th><td>18.0 </td><td> 8.45</td></tr>\n</tbody>\n</table>\n",
"text/plain": " U1 U2 \nmean 95.5 33.98\nsd 18.0 8.45",
"text/markdown": "1. 95.468085106383\n2. 18.0287826204436\n3. 33.9787234042553\n4. 8.44544992417998\n\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#is the unemployment rate for younger males (14 - 24) > for older males (35 - 39)?\n#in this case, the 2 groups are not independent since you would not expect the unemployment\n#rate for younger and older males in Alabama to be unrelated\n#here's an example:\nlibrary(MASS) #attach the MASS package\nsapply(UScrime[c(\"U1\", \"U2\")], function(x)(c(mean = mean(x), sd = sd(x)))) #use sapply() to calculate mean and sd for the 2 groups"
},
{
"execution_count": 81,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tPaired t-test\n\ndata: U1 and U2\nt = 30, df = 50, p-value <2e-16\nalternative hypothesis: true difference in means is not equal to 0\n95 percent confidence interval:\n 57.7 65.3\nsample estimates:\nmean of the differences \n 61.5 \n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "with(UScrime, t.test(U1, U2, paired = T))"
},
{
"execution_count": 82,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the mean difference (61.5) is large enough to warrant rejection of the hypothesis that the mean\n#unemployent rate for older and younger males is the same. younger males have a higher rate.\n#the probability of obtaining a sample difference this large IF the population means are equal is,\n#less than 0.0000000000000002 (i.e. p-value of 2e-16 or 2 * 10 ^ -16)"
},
{
"metadata": {
"collapsed": true
},
"cell_type": "markdown",
"source": "## Nonparametric tests of group differences"
},
{
"execution_count": 83,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#if outcome variables are severely skewed or ordinal in nature, you may wish to use nonparametric tests"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Comparing two groups"
},
{
"execution_count": 84,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "So: 0\n[1] 0.0382\n------------------------------------------------------------ \nSo: 1\n[1] 0.0556"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if two groups are independent, you can use the Wilcoxon rank sum test (aka Mann-Whitney U test)\n#here's an example:\nlibrary(MASS) #load the MASS package\nwith(UScrime, by(Prob, So, median)) #calculate median probabilities of incarceration rates by Southern v/s non-Southern states"
},
{
"execution_count": 85,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tWilcoxon rank sum test\n\ndata: Prob by So\nW = 80, p-value = 8e-05\nalternative hypothesis: true location shift is not equal to 0\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "wilcox.test(Prob ~ So, data = UScrime) #run the test"
},
{
"execution_count": 86,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#you can reject the null hypothesis that incarceration rates are the same in\n#Southern and non-Southern states (since p-value < 0.001; 8e-05 = 8 * 10 ^ -5)"
},
{
"execution_count": 87,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/latex": "\\begin{description*}\n\\item[U1] 92\n\\item[U2] 34\n\\end{description*}\n",
"text/html": "<dl class=dl-horizontal>\n\t<dt>U1</dt>\n\t\t<dd>92</dd>\n\t<dt>U2</dt>\n\t\t<dd>34</dd>\n</dl>\n",
"text/plain": "U1 U2 \n92 34 ",
"text/markdown": "U1\n: 92U2\n: 34\n\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#if the two groups are paired, you can apply the same test but with the paired = TRUE option\n#here's an example:\nsapply(UScrime[c(\"U1\", \"U2\")], median) #calculate the median unemployment rate for young and old male age groups"
},
{
"execution_count": 88,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "Warning message in wilcox.test.default(U1, U2, paired = T):\n\u201ccannot compute exact p-value with ties\u201d"
},
{
"metadata": {},
"data": {
"text/plain": "\n\tWilcoxon signed rank test with continuity correction\n\ndata: U1 and U2\nV = 1000, p-value = 2e-09\nalternative hypothesis: true location shift is not equal to 0\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "with(UScrime, wilcox.test(U1, U2, paired = T)) #run the test with paired = T option"
},
{
"execution_count": 89,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#again, we can reject the null hypothesis since the p-value < 0.001"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Comparing more than two groups"
},
{
"execution_count": 90,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#if the groups are independent, you can use the Kruskal-Wallis test;\n#if the groups are dependent, you can use the Friedman test"
},
{
"execution_count": 91,
"cell_type": "code",
"outputs": [
{
"metadata": {},
"data": {
"text/plain": "\n\tKruskal-Wallis rank sum test\n\ndata: Illiteracy by state.region\nKruskal-Wallis chi-squared = 20, df = 3, p-value = 5e-05\n"
},
"output_type": "display_data"
}
],
"metadata": {},
"source": "#here's an example of a Kruskal-Wallis test\nstates <- data.frame(state.region, state.x77) #add the region designation to the dataset\nkruskal.test(Illiteracy ~ state.region, data = states) #apply the test"
},
{
"execution_count": 92,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#this significance test suggests that the illiteracy rate isn't the same in each of the four\n#regions of the country (p-value < 0.001; p-value = 5e-05 = 5 * 10 ^ -5)"
},
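{
"execution_count": null,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#the Friedman test mentioned above is not illustrated with this dataset, so here is a hedged sketch\n#on small made-up data: friedman.test() (base stats package) accepts a matrix whose rows are blocks\n#(e.g. subjects) and whose columns are the dependent groups (e.g. treatments)\nratings <- matrix(c(1, 2, 3,\n 2, 1, 3,\n 1, 3, 2,\n 1, 2, 3),\n nrow = 4, byrow = TRUE,\n dimnames = list(subject = 1:4, treatment = c(\"A\", \"B\", \"C\")))\nfriedman.test(ratings) #null hypothesis: no systematic difference among treatments after blocking on subject"
},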
{
"execution_count": 94,
"cell_type": "code",
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Descriptive Statistics\n\n West North Central Northeast South\nn 13.000 12.000 9.000 16.000\nmedian 0.600 0.700 1.100 1.750\nmad 0.148 0.148 0.297 0.593\n\nMultiple Comparisons (Wilcoxon Rank Sum Tests)\nProbability Adjustment = holm\n\n Group.1 Group.2 W p \n1 West North Central 88.0 8.67e-01 \n2 West Northeast 46.5 8.67e-01 \n3 West South 39.0 1.79e-02 *\n4 North Central Northeast 20.5 5.36e-02 .\n5 North Central South 2.0 8.05e-05 ***\n6 Northeast South 18.0 1.19e-02 *\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n"
}
],
"metadata": {},
"source": "#nonparametric multiple comparisons using the wmc() function\nsource(\"http://www.statmethods.net/RiA/wmc.txt\") #access the function\nstates <- data.frame(state.region, state.x77) #create the data frame\nwmc(Illiteracy ~ state.region, data = states, method = \"holm\") #run the test"
},
{
"execution_count": 95,
"cell_type": "code",
"outputs": [],
"metadata": {
"collapsed": true
},
"source": "#you can see from the test results that the South differs significantly\n#from the other three regions and that the other three regions do not\n#differ from each other at a p < 0.05 level."
}
],
"nbformat": 4,
"nbformat_minor": 1
}