#slide 181-187

# A data frame with 32 observations on 11 variables.
# 
# [, 1]  mpg     Miles/(US) gallon
# [, 2]  cyl     Number of cylinders
# [, 3]  disp    Displacement (cu.in.)
# [, 4]  hp  Gross horsepower
# [, 5]  drat    Rear axle ratio
# [, 6]  wt  Weight (1000 lbs)
# [, 7]  qsec    1/4 mile time
# [, 8]  vs  Engine (0 = V-shaped, 1 = straight)
# [, 9]  am  Transmission (0 = automatic, 1 = manual)
# [,10]  gear    Number of forward gears
# [,11]  carb    Number of carburetors
# Size of the data set as (rows, columns).
dim(x = mtcars)
## [1] 32 11
# Lay the per-column histograms out on a 3 x 4 grid, one panel per variable.
par(mfrow = c(3, 4))
sapply(X = mtcars, FUN = hist)
##          mpg       cyl       disp       hp        drat      wt       
## breaks   Numeric,6 Numeric,9 Numeric,10 Numeric,7 Numeric,6 Numeric,9
## counts   Integer,5 Integer,8 Integer,9  Integer,6 Integer,5 Integer,8
## density  Numeric,5 Numeric,8 Numeric,9  Numeric,6 Numeric,5 Numeric,8
## mids     Numeric,5 Numeric,8 Numeric,9  Numeric,6 Numeric,5 Numeric,8
## xname    "X[[i]]"  "X[[i]]"  "X[[i]]"   "X[[i]]"  "X[[i]]"  "X[[i]]" 
## equidist TRUE      TRUE      TRUE       TRUE      TRUE      TRUE     
##          qsec       vs        am        gear      carb     
## breaks   Numeric,10 Numeric,6 Numeric,6 Numeric,5 Numeric,8
## counts   Integer,9  Integer,5 Integer,5 Integer,4 Integer,7
## density  Numeric,9  Numeric,5 Numeric,5 Numeric,4 Numeric,7
## mids     Numeric,9  Numeric,5 Numeric,5 Numeric,4 Numeric,7
## xname    "X[[i]]"   "X[[i]]"  "X[[i]]"  "X[[i]]"  "X[[i]]" 
## equidist TRUE       TRUE      TRUE      TRUE      TRUE
# One boxplot per column; the printed matrix holds each plot's statistics.
sapply(X = mtcars, FUN = boxplot)

##       mpg       cyl       disp      hp        drat      wt       
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n     32        32        32        32        32        32       
## conf  Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out   Numeric,0 Numeric,0 Numeric,0 335       Numeric,0 Numeric,2
## group Numeric,0 Numeric,0 Numeric,0 1         Numeric,0 Numeric,2
## names "1"       "1"       "1"       "1"       "1"       "1"      
##       qsec      vs        am        gear      carb     
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n     32        32        32        32        32       
## conf  Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out   22.9      Numeric,0 Numeric,0 Numeric,0 8        
## group 1         Numeric,0 Numeric,0 Numeric,0 1        
## names "1"       "1"       "1"       "1"       "1"
# Most frequent value (statistical mode) of a vector.
#
# Args:
#   x:     Atomic vector or factor.
#   na.rm: If TRUE, missing values are dropped before counting. With the
#          default (FALSE) NA is counted like any other value, so the result
#          can be NA when NA is the most frequent entry.
#
# Returns the value of `x` that occurs most often; ties go to the value that
# appears first in `x`.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps every element to its position in `ux`; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}


# Per-column summary statistics. vapply() pins the shape of every result
# (one, five or two numbers per column), so a column of an unexpected type
# fails loudly instead of silently changing the shape of the output.
vapply(mtcars, min, numeric(1))
##    mpg    cyl   disp     hp   drat     wt   qsec     vs     am   gear 
## 10.400  4.000 71.100 52.000  2.760  1.513 14.500  0.000  0.000  3.000 
##   carb 
##  1.000
vapply(mtcars, mean, numeric(1))
##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250 
##       qsec         vs         am       gear       carb 
##  17.848750   0.437500   0.406250   3.687500   2.812500
vapply(mtcars, median, numeric(1))
##     mpg     cyl    disp      hp    drat      wt    qsec      vs      am 
##  19.200   6.000 196.300 123.000   3.695   3.325  17.710   0.000   0.000 
##    gear    carb 
##   4.000   2.000
vapply(mtcars, max, numeric(1))
##     mpg     cyl    disp      hp    drat      wt    qsec      vs      am 
##  33.900   8.000 472.000 335.000   4.930   5.424  22.900   1.000   1.000 
##    gear    carb 
##   5.000   8.000
vapply(mtcars, sd, numeric(1))
##         mpg         cyl        disp          hp        drat          wt 
##   6.0269481   1.7859216 123.9386938  68.5628685   0.5346787   0.9784574 
##        qsec          vs          am        gear        carb 
##   1.7869432   0.5040161   0.4989909   0.7378041   1.6152000
vapply(mtcars, IQR, numeric(1))
##       mpg       cyl      disp        hp      drat        wt      qsec 
##   7.37500   4.00000 205.17500  83.50000   0.84000   1.02875   2.00750 
##        vs        am      gear      carb 
##   1.00000   1.00000   1.00000   2.00000
vapply(mtcars, var, numeric(1))
##          mpg          cyl         disp           hp         drat 
## 3.632410e+01 3.189516e+00 1.536080e+04 4.700867e+03 2.858814e-01 
##           wt         qsec           vs           am         gear 
## 9.573790e-01 3.193166e+00 2.540323e-01 2.489919e-01 5.443548e-01 
##         carb 
## 2.608871e+00
# Five quantiles (0%, 25%, 50%, 75%, 100%) per column.
vapply(mtcars, quantile, numeric(5))
##         mpg cyl    disp    hp  drat      wt    qsec vs am gear carb
## 0%   10.400   4  71.100  52.0 2.760 1.51300 14.5000  0  0    3    1
## 25%  15.425   4 120.825  96.5 3.080 2.58125 16.8925  0  0    3    2
## 50%  19.200   6 196.300 123.0 3.695 3.32500 17.7100  0  0    4    2
## 75%  22.800   8 326.000 180.0 3.920 3.61000 18.9000  1  1    4    4
## 100% 33.900   8 472.000 335.0 4.930 5.42400 22.9000  1  1    5    8
# Most frequent value per column, using the Mode() helper defined in this script.
vapply(mtcars, Mode, numeric(1))
##    mpg    cyl   disp     hp   drat     wt   qsec     vs     am   gear 
##  21.00   8.00 275.80 110.00   3.92   3.44  17.02   0.00   0.00   3.00 
##   carb 
##   4.00
# Minimum and maximum per column.
vapply(mtcars, range, numeric(2))
##       mpg cyl  disp  hp drat    wt qsec vs am gear carb
## [1,] 10.4   4  71.1  52 2.76 1.513 14.5  0  0    3    1
## [2,] 33.9   8 472.0 335 4.93 5.424 22.9  1  1    5    8
# Draw a histogram and a boxplot of one variable side by side.
#
# Args:
#   x:    Numeric vector to plot.
#   main: Title used for both panels. Defaults to the expression passed as
#         `x`; a bare column vector has no names(), so names(x) is NULL and
#         cannot serve as a title.
#
# Returns NULL invisibly. The caller's par() settings are restored on exit.
newp <- function(x, main = paste(deparse(substitute(x)), collapse = "")) {
  # Evaluate the default title before `x` is touched anywhere else.
  force(main)
  # Two plots are drawn, so use a 1 x 2 layout and undo it afterwards.
  old_par <- par(mfrow = c(1, 2))
  on.exit(par(old_par), add = TRUE)
  hist(x, breaks = 10, col = heat.colors(5), main = main)
  boxplot(x, col = topo.colors(5), main = main)
  invisible(NULL)
}
# Histogram and boxplot for a single column ...
newp(x = mtcars$gear)

## NULL
## NULL
## NULL
# ... and then for every column of the data set.
sapply(X = mtcars, FUN = newp)

## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL

## NULL
## NULL
## NULL
## NULL
## $mpg
## NULL
## 
## $cyl
## NULL
## 
## $disp
## NULL
## 
## $hp
## NULL
## 
## $drat
## NULL
## 
## $wt
## NULL
## 
## $qsec
## NULL
## 
## $vs
## NULL
## 
## $am
## NULL
## 
## $gear
## NULL
## 
## $carb
## NULL
# Back to a 3 x 4 panel grid for the plots that follow.
par(mfrow = c(3, 4))

# Column names of the data set.
colnames(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
# Scatter plot of miles per gallon against one variable.
#
# Args:
#   x: Values for the x axis, one per row of mtcars.
#
# Plots mtcars$mpg on the y axis against `x`, cycling through seven rainbow
# colours for the points.
newx <- function(x) {
  plot(x = x, y = mtcars$mpg, col = rainbow(n = 7))
}
# mpg against every column, one scatter plot per panel.
sapply(X = mtcars, FUN = newx)
## $mpg
## NULL
## 
## $cyl
## NULL
## 
## $disp
## NULL
## 
## $hp
## NULL
## 
## $drat
## NULL
## 
## $wt
## NULL
## 
## $qsec
## NULL
## 
## $vs
## NULL
## 
## $am
## NULL
## 
## $gear
## NULL
## 
## $carb
## NULL
# Put the mtcars columns on the search path; later statements in this script
# refer to cyl, gear and vs by bare name.
attach(what = mtcars)
# Distinct cylinder counts, in order of first appearance.
unique(x = mtcars$cyl)
## [1] 6 4 8
# 3 x 4 panel grid for the grouped boxplots below.
par(mfrow = c(3, 4))

# Boxplots of miles per gallon, one box per distinct value of `x`.
#
# Args:
#   x: Grouping values, one per row of mtcars.
#
# Returns the list of statistics produced by boxplot().
newb <- function(x) {
  box_colours <- rainbow(7)
  boxplot(mtcars$mpg ~ x, col = box_colours)
}
# mpg grouped by every column in turn; prints the boxplot statistics matrix.
sapply(X = mtcars, FUN = newb)
##       mpg          cyl         disp         hp           drat        
## stats Numeric,125  Numeric,15  Numeric,135  Numeric,110  Numeric,110 
## n     Numeric,25   Numeric,3   Numeric,27   Numeric,22   Numeric,22  
## conf  Numeric,50   Numeric,6   Numeric,54   Numeric,44   Numeric,44  
## out   Numeric,0    Numeric,2   Numeric,0    Numeric,0    Numeric,0   
## group Numeric,0    Numeric,2   Numeric,0    Numeric,0    Numeric,0   
## names Character,25 Character,3 Character,27 Character,22 Character,22
##       wt           qsec         vs          am          gear       
## stats Numeric,145  Numeric,150  Numeric,10  Numeric,10  Numeric,15 
## n     Numeric,29   Numeric,30   Numeric,2   Numeric,2   Numeric,3  
## conf  Numeric,58   Numeric,60   Numeric,4   Numeric,4   Numeric,6  
## out   Numeric,0    Numeric,0    26          Numeric,0   Numeric,0  
## group Numeric,0    Numeric,0    1           Numeric,0   Numeric,0  
## names Character,29 Character,30 Character,2 Character,2 Character,3
##       carb       
## stats Numeric,30 
## n     Numeric,6  
## conf  Numeric,12 
## out   Numeric,0  
## group Numeric,0  
## names Character,6
# Two panels side by side.
par(mfrow=c(1,2))

# Cylinders grouped by each distinct mpg value (one box per mpg value).
# NOTE(review): the formula looks reversed relative to the next plot -
# confirm that the contrast is intentional.
boxplot(cyl~mtcars$mpg,col=rainbow(7))

# mpg grouped by number of cylinders. The bare `cyl` is found through the
# attach(mtcars) call earlier in this script.
boxplot(mtcars$mpg~cyl,col=rainbow(7))

# Cross-tabulate forward gears (rows) against cylinders (columns). The columns
# are taken from mtcars explicitly so the table does not depend on attach()
# or on another package masking a bare column name.
a <- table(gear = mtcars$gear, cyl = mtcars$cyl)
a
##     cyl
## gear  4  6  8
##    3  1  2 12
##    4  8  4  0
##    5  2  1  2
# Pearson chi-squared test of independence between gear and cyl.
chisq.test(a)
## Warning in chisq.test(a): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  a
## X-squared = 18.036, df = 4, p-value = 0.001214
#?mtcars
#install.packages("vcd")
library(vcd)
## Loading required package: grid

# Mosaic plot of the gear-by-cylinder contingency table.
vcd::mosaic(a)

library(RColorBrewer)
# Cross-tabulate engine shape (rows) against forward gears (columns). The
# columns are taken from mtcars explicitly instead of relying on attach().
counts <- table(vs = mtcars$vs, gear = mtcars$gear)
counts
##    gear
## vs   3  4  5
##   0 12  2  4
##   1  3 10  1
mosaic(counts)

# Stacked bar charts; `legend.text` is spelled out rather than relying on
# partial matching of `legend`.
barplot(counts, legend.text = rownames(counts), col = brewer.pal(3, "Set1"))
barplot(a, legend.text = rownames(a), col = brewer.pal(3, "Greens"))

# Pearson chi-squared test of independence between vs and gear.
chisq.test(counts)
## Warning in chisq.test(counts): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  counts
## X-squared = 12.224, df = 2, p-value = 0.002216
#chisquare test
#http://www.r-tutor.com/elementary-statistics/goodness-fit/chi-squared-test-independence

library(MASS)
# Column types and a preview of the student survey data set from MASS.
str(object = survey)
## 'data.frame':    237 obs. of  12 variables:
##  $ Sex   : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
##  $ Wr.Hnd: num  18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
##  $ NW.Hnd: num  18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
##  $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Fold  : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
##  $ Pulse : int  92 104 87 NA 35 64 83 74 72 90 ...
##  $ Clap  : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
##  $ Exer  : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
##  $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ Height: num  173 178 NA 160 165 ...
##  $ M.I   : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
##  $ Age   : num  18.2 17.6 16.9 20.3 23.7 ...
# Contingency table of smoking habit (rows) by exercise level (columns).
tb1 <- table(survey[["Smoke"]], survey[["Exer"]])
print(tb1)
##        
##         Freq None Some
##   Heavy    7    1    3
##   Never   87   18   84
##   Occas   12    3    4
##   Regul    9    1    7
#mosaic(tb1)

# Chi-squared test of independence on the contingency table.
chisq.test(x = tb1)
## Warning in chisq.test(tb1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tb1
## X-squared = 5.4885, df = 6, p-value = 0.4828
# Conclusion: as the p-value 0.4828 is greater than the 0.05 significance level, we do not reject
# the null hypothesis that the students' smoking habit is independent of their exercise level.
# Question being answered: test, at the 0.05 significance level, the hypothesis that the
# students' smoking habit is independent of their exercise level.
# 
# > library(readr)
# > train <- read_csv("C:/Users/AH0158691/Downloads/train.csv")
# Parsed with column specification:
#   cols(
#     PassengerId = col_integer(),
#     Survived = col_integer(),
#     Pclass = col_integer(),
#     Name = col_character(),
#     Sex = col_character(),
#     Age = col_double(),
#     SibSp = col_integer(),
#     Parch = col_integer(),
#     Ticket = col_character(),
#     Fare = col_double(),
#     Cabin = col_character(),
#     Embarked = col_character()
#   )
# > View(train)
getwd()
## [1] "C:/Users/AH0158691/Documents"
# The output recorded below shows the text columns as factors. read.csv()
# stopped converting strings to factors by default in R 4.0.0, so the
# conversion is requested explicitly to keep the results reproducible.
train <- read.csv(
  "C:/Users/AH0158691/Downloads/train.csv",
  stringsAsFactors = TRUE
)

str(train)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# First ten cabin entries; most of them print as blank.
head(x = train$Cabin, n = 10)
##  [1]      C85       C123           E46                
## 148 Levels:  A10 A14 A16 A19 A20 A23 A24 A26 A31 A32 A34 A36 A5 A6 ... T
# Per-column summary of the whole training set.
summary(object = train)
##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked
##             :687    :  2   
##  B96 B98    :  4   C:168   
##  C23 C25 C27:  4   Q: 77   
##  G6         :  4   S:644   
##  C22 C26    :  3           
##  D          :  3           
##  (Other)    :186
# is.na() reports no missing cabins ...
table(is.na(train$Cabin))
## 
## FALSE 
##   891
# ... because missing cabins are stored as empty strings, not NA. Count the
# blanks as missing too (687 blank cabins, as in the summary above).
table(is.na(train$Cabin) | train$Cabin == "")
## 
## FALSE  TRUE 
##   204   687
table(is.na(train$Pclass))
## 
## FALSE 
##   891
# Embarked also encodes missing values as empty strings (2 of them).
table(is.na(train$Embarked) | train$Embarked == "")
## 
## FALSE  TRUE 
##   889     2
# Frequency of every cabin label, including the blank one.
table(train[["Cabin"]])
## 
##                             A10             A14             A16 
##             687               1               1               1 
##             A19             A20             A23             A24 
##               1               1               1               1 
##             A26             A31             A32             A34 
##               1               1               1               1 
##             A36              A5              A6              A7 
##               1               1               1               1 
##            B101            B102             B18             B19 
##               1               1               2               1 
##             B20             B22             B28              B3 
##               2               2               2               1 
##             B30             B35             B37             B38 
##               1               2               1               1 
##             B39              B4             B41             B42 
##               1               1               1               1 
##             B49              B5             B50     B51 B53 B55 
##               2               2               1               2 
## B57 B59 B63 B66         B58 B60             B69             B71 
##               2               2               1               1 
##             B73             B77             B78             B79 
##               1               2               1               1 
##             B80         B82 B84             B86             B94 
##               1               1               1               1 
##         B96 B98            C101            C103            C104 
##               4               1               1               1 
##            C106            C110            C111            C118 
##               1               1               1               1 
##            C123            C124            C125            C126 
##               2               2               2               2 
##            C128            C148              C2         C22 C26 
##               1               1               2               3 
##     C23 C25 C27             C30             C32             C45 
##               4               1               1               1 
##             C46             C47             C49             C50 
##               1               1               1               1 
##             C52             C54         C62 C64             C65 
##               2               1               1               2 
##             C68              C7             C70             C78 
##               2               1               1               2 
##             C82             C83             C85             C86 
##               1               2               1               1 
##             C87             C90             C91             C92 
##               1               1               1               2 
##             C93             C95             C99               D 
##               2               1               1               3 
##         D10 D12             D11             D15             D17 
##               1               1               1               2 
##             D19             D20             D21             D26 
##               1               2               1               2 
##             D28             D30             D33             D35 
##               1               1               2               2 
##             D36             D37             D45             D46 
##               2               1               1               1 
##             D47             D48             D49             D50 
##               1               1               1               1 
##             D56              D6              D7              D9 
##               1               1               1               1 
##             E10            E101             E12            E121 
##               1               3               1               2 
##             E17             E24             E25             E31 
##               1               2               2               1 
##             E33             E34             E36             E38 
##               2               1               1               1 
##             E40             E44             E46             E49 
##               1               2               1               1 
##             E50             E58             E63             E67 
##               1               1               1               2 
##             E68             E77              E8           F E69 
##               1               1               2               1 
##           F G63           F G73              F2             F33 
##               1               2               3               3 
##             F38              F4              G6               T 
##               1               2               4               1
# Working copy of the training data without the Cabin column.
train2 <- train[, names(train) != "Cabin", drop = FALSE]

# Distribution of passenger age, including the number of missing values.
summary(object = train2$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.42   20.12   28.00   29.70   38.00   80.00     177
# Survived is coded 0/1, so its mean is the share of survivors.
summary(object = train2$Survived)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3838  1.0000  1.0000
# Survival counts for every distinct age value.
table(train2[["Age"]], train2[["Survived"]])
##       
##         0  1
##   0.42  0  1
##   0.67  0  1
##   0.75  0  2
##   0.83  0  2
##   0.92  0  1
##   1     2  5
##   2     7  3
##   3     1  5
##   4     3  7
##   5     0  4
##   6     1  2
##   7     2  1
##   8     2  2
##   9     6  2
##   10    2  0
##   11    3  1
##   12    0  1
##   13    0  2
##   14    3  3
##   14.5  1  0
##   15    1  4
##   16   11  6
##   17    7  6
##   18   17  9
##   19   16  9
##   20   12  3
##   20.5  1  0
##   21   19  5
##   22   16 11
##   23   10  5
##   23.5  1  0
##   24   15 15
##   24.5  1  0
##   25   17  6
##   26   12  6
##   27    7 11
##   28   18  7
##   28.5  2  0
##   29   12  8
##   30   15 10
##   30.5  2  0
##   31    9  8
##   32    9  9
##   32.5  1  1
##   33    9  6
##   34    9  6
##   34.5  1  0
##   35    7 11
##   36   11 11
##   36.5  1  0
##   37    5  1
##   38    6  5
##   39    9  5
##   40    7  6
##   40.5  2  0
##   41    4  2
##   42    7  6
##   43    4  1
##   44    6  3
##   45    7  5
##   45.5  2  0
##   46    3  0
##   47    8  1
##   48    3  6
##   49    2  4
##   50    5  5
##   51    5  2
##   52    3  3
##   53    0  1
##   54    5  3
##   55    1  1
##   55.5  1  0
##   56    2  2
##   57    2  0
##   58    2  3
##   59    2  0
##   60    2  2
##   61    3  0
##   62    2  2
##   63    0  2
##   64    2  0
##   65    3  0
##   66    1  0
##   70    2  0
##   70.5  1  0
##   71    2  0
##   74    1  0
##   80    0  1
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Bin age into 10 equal-width intervals, then cross-tabulate the bins with
# survival.
train2[["Age2"]] <- discretize(train2$Age, "interval", 10)
table(train2[["Age2"]], train2[["Survived"]])
##                
##                   0   1
##   [ 0.42, 8.38)  18  36
##   [ 8.38,16.34)  27  19
##   [16.34,24.29) 114  63
##   [24.29,32.25) 104  65
##   [32.25,40.21)  66  52
##   [40.21,48.17)  46  24
##   [48.17,56.13)  24  21
##   [56.13,64.08)  15   9
##   [64.08,72.04)   9   0
##   [72.04,80.00]   1   1
# Passengers with unknown age, compared with the full training set.
train3 <- subset(train2, is.na(Age))
# Survival among passengers with unknown age ...
table(train3[["Survived"]])
## 
##   0   1 
## 125  52
# ... versus survival among all passengers.
table(train[["Survived"]])
## 
##   0   1 
## 549 342
# Passenger class among passengers with unknown age ...
table(train3[["Pclass"]])
## 
##   1   2   3 
##  30  11 136
# ... versus passenger class among all passengers.
table(train[["Pclass"]])
## 
##   1   2   3 
## 216 184 491
# Keep only the passengers whose age is known. Negating the logical mask with
# `-` yields 0 / -1, and without a comma it indexes columns, so the previous
# form merely dropped the first column instead of filtering rows.
train4 <- train2[!is.na(train2$Age), ]


library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
## 
##     mpg
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Overall median age, ignoring missing values. TRUE is spelled out because T
# is an ordinary variable that can be reassigned.
median(train2$Age, na.rm = TRUE)
## [1] 28
library(data.table)
# Convert to a data.table for the grouped query below.
train2 <- data.table(train2)
names(train2)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Embarked"    "Age2"
# Median age per passenger class. na.rm has to be an argument of median();
# as a separate element of .() it only added a constant column and left
# every median NA.
train2[, .(median(Age, na.rm = TRUE)), by = .(Pclass)]
##    Pclass V1
## 1:      3 24
## 2:      1 37
## 3:      2 29
# Complete cases only, i.e. rows without any NA.
train5 <- na.omit(train2)
# Median age per passenger class. The Hmisc namespace is given explicitly
# because other packages export a function of the same name.
Hmisc::summarize(train5$Age, train5$Pclass, median)
##   train5$Pclass train5$Age
## 1             1         37
## 2             2         29
## 3             3         24
median(train5$Age, na.rm = TRUE)
## [1] 28