#slide 181-187
# A data frame with 32 observations on 11 variables.
#
# [, 1] mpg Miles/(US) gallon
# [, 2] cyl Number of cylinders
# [, 3] disp Displacement (cu.in.)
# [, 4] hp Gross horsepower
# [, 5] drat Rear axle ratio
# [, 6] wt Weight (1000 lbs)
# [, 7] qsec 1/4 mile time
# [, 8] vs Engine shape (0 = V-shaped, 1 = straight)
# [, 9] am Transmission (0 = automatic, 1 = manual)
# [,10] gear Number of forward gears
# [,11] carb Number of carburetors
# Number of rows (cars) and columns (variables) in the data set.
c(nrow(mtcars), ncol(mtcars))
## [1] 32 11
# One page with 3 x 4 panels: a histogram for each of the 11 columns. The
# matrix printed afterwards lists the histogram components per column.
par(mfrow = c(3, 4))
sapply(X = mtcars, FUN = hist)
## mpg cyl disp hp drat wt
## breaks Numeric,6 Numeric,9 Numeric,10 Numeric,7 Numeric,6 Numeric,9
## counts Integer,5 Integer,8 Integer,9 Integer,6 Integer,5 Integer,8
## density Numeric,5 Numeric,8 Numeric,9 Numeric,6 Numeric,5 Numeric,8
## mids Numeric,5 Numeric,8 Numeric,9 Numeric,6 Numeric,5 Numeric,8
## xname "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]"
## equidist TRUE TRUE TRUE TRUE TRUE TRUE
## qsec vs am gear carb
## breaks Numeric,10 Numeric,6 Numeric,6 Numeric,5 Numeric,8
## counts Integer,9 Integer,5 Integer,5 Integer,4 Integer,7
## density Numeric,9 Numeric,5 Numeric,5 Numeric,4 Numeric,7
## mids Numeric,9 Numeric,5 Numeric,5 Numeric,4 Numeric,7
## xname "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]"
## equidist TRUE TRUE TRUE TRUE TRUE
# One boxplot per column; the printed matrix lists the boxplot statistics
# (stats, n, conf, out, group, names) for each column.
sapply(X = mtcars, FUN = boxplot)

## mpg cyl disp hp drat wt
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n 32 32 32 32 32 32
## conf Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out Numeric,0 Numeric,0 Numeric,0 335 Numeric,0 Numeric,2
## group Numeric,0 Numeric,0 Numeric,0 1 Numeric,0 Numeric,2
## names "1" "1" "1" "1" "1" "1"
## qsec vs am gear carb
## stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
## n 32 32 32 32 32
## conf Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
## out 22.9 Numeric,0 Numeric,0 Numeric,0 8
## group 1 Numeric,0 Numeric,0 Numeric,0 1
## names "1" "1" "1" "1" "1"
# Return the most frequent value of `x`. When several values share the
# highest count, the one that appears first in `x` is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of each element of `x` within the distinct values.
  positions <- match(x, distinct_values)
  frequencies <- tabulate(positions)
  distinct_values[which.max(frequencies)]
}
# Smallest value of every column.
vapply(mtcars, min, numeric(1))
## mpg cyl disp hp drat wt qsec vs am gear
## 10.400 4.000 71.100 52.000 2.760 1.513 14.500 0.000 0.000 3.000
## carb
## 1.000
# Arithmetic mean of every column.
vapply(mtcars, mean, numeric(1))
## mpg cyl disp hp drat wt
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250
## qsec vs am gear carb
## 17.848750 0.437500 0.406250 3.687500 2.812500
# Median of every column.
vapply(mtcars, median, numeric(1))
## mpg cyl disp hp drat wt qsec vs am
## 19.200 6.000 196.300 123.000 3.695 3.325 17.710 0.000 0.000
## gear carb
## 4.000 2.000
# Largest value of every column.
vapply(mtcars, max, numeric(1))
## mpg cyl disp hp drat wt qsec vs am
## 33.900 8.000 472.000 335.000 4.930 5.424 22.900 1.000 1.000
## gear carb
## 5.000 8.000
# Sample standard deviation of every column.
vapply(mtcars, sd, numeric(1))
## mpg cyl disp hp drat wt
## 6.0269481 1.7859216 123.9386938 68.5628685 0.5346787 0.9784574
## qsec vs am gear carb
## 1.7869432 0.5040161 0.4989909 0.7378041 1.6152000
# Interquartile range of every column.
vapply(mtcars, IQR, numeric(1))
## mpg cyl disp hp drat wt qsec
## 7.37500 4.00000 205.17500 83.50000 0.84000 1.02875 2.00750
## vs am gear carb
## 1.00000 1.00000 1.00000 2.00000
# Sample variance of every column.
vapply(mtcars, var, numeric(1))
## mpg cyl disp hp drat
## 3.632410e+01 3.189516e+00 1.536080e+04 4.700867e+03 2.858814e-01
## wt qsec vs am gear
## 9.573790e-01 3.193166e+00 2.540323e-01 2.489919e-01 5.443548e-01
## carb
## 2.608871e+00
# Five-number quantiles (0%, 25%, 50%, 75%, 100%) of every column, one
# column of the result per variable.
vapply(mtcars, quantile, numeric(5))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0% 10.400 4 71.100 52.0 2.760 1.51300 14.5000 0 0 3 1
## 25% 15.425 4 120.825 96.5 3.080 2.58125 16.8925 0 0 3 2
## 50% 19.200 6 196.300 123.0 3.695 3.32500 17.7100 0 0 4 2
## 75% 22.800 8 326.000 180.0 3.920 3.61000 18.9000 1 1 4 4
## 100% 33.900 8 472.000 335.0 4.930 5.42400 22.9000 1 1 5 8
# Most frequent value of every column, using the Mode() helper defined above.
vapply(mtcars, Mode, numeric(1))
## mpg cyl disp hp drat wt qsec vs am gear
## 21.00 8.00 275.80 110.00 3.92 3.44 17.02 0.00 0.00 3.00
## carb
## 4.00
# Minimum (row 1) and maximum (row 2) of every column.
vapply(mtcars, range, numeric(2))
## mpg cyl disp hp drat wt qsec vs am gear carb
## [1,] 10.4 4 71.1 52 2.76 1.513 14.5 0 0 3 1
## [2,] 33.9 8 472.0 335 4.93 5.424 22.9 1 1 5 8
# Draw a histogram and a boxplot of `x` and print the label used as their
# title.
#
# Arguments:
#   x     Numeric vector to plot.
#   label Title for both plots. Defaults to the expression supplied as `x`
#         (for example "mtcars$gear"). names(x) cannot serve as the title: a
#         data-frame column is an unnamed vector, so names(x) is NULL, which
#         left the plots untitled and printed NULL three times per call.
#
# Value: NULL, invisibly.
#
# Side effect: switches the active device to a 1 x 3 layout and does not
# restore the previous layout; reset par(mfrow = ...) afterwards if needed.
newp <- function(x, label = paste(deparse(substitute(x)), collapse = " ")) {
  # Resolve the default label up front, while `x` is still the caller's
  # unevaluated expression.
  force(label)
  par(mfrow = c(1, 3))
  hist(x, breaks = 10, col = heat.colors(5), main = label)
  boxplot(x, col = topo.colors(5), main = label)
  print(label)
  invisible(NULL)
}
newp(mtcars$gear)

## [1] "mtcars$gear"
# Map() hands every column to newp() together with its name, so each pair of
# plots is titled; invisible() suppresses the list of NULL return values.
invisible(Map(newp, mtcars, names(mtcars)))

## [1] "mpg"
## [1] "cyl"
## [1] "disp"
## [1] "hp"
## [1] "drat"
## [1] "wt"
## [1] "qsec"
## [1] "vs"
## [1] "am"
## [1] "gear"
## [1] "carb"
# Back to a 3 x 4 grid for the per-column plots that follow.
par(mfrow = c(3, 4))

# Column names of the data set.
colnames(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Scatter-plot mtcars$mpg (y axis) against the supplied vector `x`; the seven
# rainbow colours are recycled across the points.
newx <- function(x) {
  point_colours <- rainbow(7)
  plot(x, mtcars$mpg, col = point_colours)
}
# One scatter plot of mpg against each column (including mpg itself).
sapply(mtcars, newx)
## $mpg
## NULL
##
## $cyl
## NULL
##
## $disp
## NULL
##
## $hp
## NULL
##
## $drat
## NULL
##
## $wt
## NULL
##
## $qsec
## NULL
##
## $vs
## NULL
##
## $am
## NULL
##
## $gear
## NULL
##
## $carb
## NULL
# NOTE(review): attach() puts the mtcars columns on the search path; the bare
# names cyl, gear and vs used further down rely on it. Packages attached
# later can mask these names (ggplot2 masks `mpg` below), so prefer
# mtcars$col or with(mtcars, ...) in new code.
attach(mtcars)
# Distinct cylinder counts, in order of first appearance.
cyl[!duplicated(cyl)]
## [1] 6 4 8
# 3 x 4 grid: one panel per column.
par(mfrow = c(3, 4))

# Boxplots of mtcars$mpg grouped by the distinct values of `x`. Returns the
# value of boxplot() (the per-group statistics).
newb <- function(x) {
  box_colours <- rainbow(7)
  boxplot(mtcars$mpg ~ x, col = box_colours)
}
# mpg grouped by every column in turn; the printed matrix summarises the
# boxplot statistics returned for each column.
sapply(mtcars, newb)
## mpg cyl disp hp drat
## stats Numeric,125 Numeric,15 Numeric,135 Numeric,110 Numeric,110
## n Numeric,25 Numeric,3 Numeric,27 Numeric,22 Numeric,22
## conf Numeric,50 Numeric,6 Numeric,54 Numeric,44 Numeric,44
## out Numeric,0 Numeric,2 Numeric,0 Numeric,0 Numeric,0
## group Numeric,0 Numeric,2 Numeric,0 Numeric,0 Numeric,0
## names Character,25 Character,3 Character,27 Character,22 Character,22
## wt qsec vs am gear
## stats Numeric,145 Numeric,150 Numeric,10 Numeric,10 Numeric,15
## n Numeric,29 Numeric,30 Numeric,2 Numeric,2 Numeric,3
## conf Numeric,58 Numeric,60 Numeric,4 Numeric,4 Numeric,6
## out Numeric,0 Numeric,0 26 Numeric,0 Numeric,0
## group Numeric,0 Numeric,0 1 Numeric,0 Numeric,0
## names Character,29 Character,30 Character,2 Character,2 Character,3
## carb
## stats Numeric,30
## n Numeric,6
## conf Numeric,12
## out Numeric,0
## group Numeric,0
## names Character,6
# Two panels side by side to compare both boxplot orientations.
par(mfrow = c(1, 2))

# NOTE(review): the first call groups cyl by each distinct mpg value; the
# second (mpg grouped by cyl) looks like the intended orientation - confirm.
boxplot(cyl ~ mtcars$mpg, col = rainbow(7))
boxplot(mtcars$mpg ~ cyl, col = rainbow(7))
# Cross-tabulate forward gears (rows) against cylinders (columns).
a <- table(gear, cyl)
print(a)
## cyl
## gear 4 6 8
## 3 1 2 12
## 4 8 4 0
## 5 2 1 2
# Pearson chi-squared test of independence on the gear-by-cyl table `a`.
# The table above contains several very small cells (0, 1, 2), which is
# presumably what triggers the approximation warning shown below.
chisq.test(a)
## Warning in chisq.test(a): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: a
## X-squared = 18.036, df = 4, p-value = 0.001214
#?mtcars
#install.packages("vcd")
library(vcd)
## Loading required package: grid

# Mosaic plot (vcd) of the gear-by-cyl contingency table `a`.
mosaic(a)

library(RColorBrewer)
# Cross-tabulate engine shape vs (rows) against forward gears (columns).
counts <- table(vs, gear)
print(counts)
## gear
## vs 3 4 5
## 0 12 2 4
## 1 3 10 1
# Mosaic plot of the vs-by-gear table.
mosaic(counts)

# Stacked bar charts of both contingency tables; the legend lists the row
# categories (vs for `counts`, gear for `a`).
barplot(counts, legend.text = rownames(counts), col = brewer.pal(3, "Set1"))
barplot(a, legend.text = rownames(a), col = brewer.pal(3, "Greens"))

# Pearson chi-squared test of independence on the vs-by-gear table `counts`.
chisq.test(counts)
## Warning in chisq.test(counts): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: counts
## X-squared = 12.224, df = 2, p-value = 0.002216
#chisquare test
#http://www.r-tutor.com/elementary-statistics/goodness-fit/chi-squared-test-independence
library(MASS)
# Structure of the MASS student survey data: column types and first values.
str(object = survey)
## 'data.frame': 237 obs. of 12 variables:
## $ Sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
## $ Wr.Hnd: num 18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
## $ NW.Hnd: num 18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
## $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
## $ Fold : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
## $ Pulse : int 92 104 87 NA 35 64 83 74 72 90 ...
## $ Clap : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
## $ Exer : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
## $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ Height: num 173 178 NA 160 165 ...
## $ M.I : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
## $ Age : num 18.2 17.6 16.9 20.3 23.7 ...
# Contingency table of smoking habit (rows) by exercise level (columns).
tb1 <- table(survey$Smoke, survey$Exer)
print(tb1)
##
## Freq None Some
## Heavy 7 1 3
## Never 87 18 84
## Occas 12 3 4
## Regul 9 1 7
#mosaic(tb1)
# Chi-squared test of independence between smoking habit and exercise level.
chisq.test(tb1)
## Warning in chisq.test(tb1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tb1
## X-squared = 5.4885, df = 6, p-value = 0.4828
#As the p-value 0.4828 is greater than the .05 significance level, we do not reject
#the null hypothesis that the smoking habit is independent of the exercise level of the students.
#Test the hypothesis whether the students smoking habit is
#independent of their exercise level at .05 significance level.
#
# > library(readr)
# > train <- read_csv("C:/Users/AH0158691/Downloads/train.csv")
# Parsed with column specification:
# cols(
# PassengerId = col_integer(),
# Survived = col_integer(),
# Pclass = col_integer(),
# Name = col_character(),
# Sex = col_character(),
# Age = col_double(),
# SibSp = col_integer(),
# Parch = col_integer(),
# Ticket = col_character(),
# Fare = col_double(),
# Cabin = col_character(),
# Embarked = col_character()
# )
# > View(train)
# Show the current working directory (informational only; the CSV below is
# read from an absolute path).
getwd()
## [1] "C:/Users/AH0158691/Documents"
# Load the Titanic training data. stringsAsFactors = TRUE is spelled out
# because the factor columns shown by str() below, and the factor-based
# summaries that follow, are only produced by default in R < 4.0.0.
# NOTE(review): the absolute path is specific to one machine; adjust it (or
# make it relative to the project) before running elsewhere.
train <- read.csv("C:/Users/AH0158691/Downloads/train.csv",
                  stringsAsFactors = TRUE)
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# First ten Cabin values. Only three labels show in the output below,
# presumably because the other seven are blank strings.
head(train$Cabin, n = 10)
## [1] C85 C123 E46
## 148 Levels: A10 A14 A16 A19 A20 A23 A24 A26 A31 A32 A34 A36 A5 A6 ... T
# Per-column summary: quartiles for numeric columns, level counts for
# factors. Note the blank level in Cabin (687) and Embarked (2) below.
summary(train)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
# Cabin contains no NA values ...
table(is.na(train$Cabin))
##
## FALSE
## 891
# ... because missing cabins are stored as empty strings rather than NA, so
# count the blanks explicitly (this replaces an exact repeat of the is.na()
# check above).
table(train$Cabin == "")
##
## FALSE TRUE
## 204 687
# NA counts for passenger class and port of embarkation.
table(is.na(train[["Pclass"]]))
##
## FALSE
## 891
# NOTE(review): Embarked has 2 blank entries in summary(train) above; blank
# strings are not NA, so they are not counted here.
table(is.na(train[["Embarked"]]))
##
## FALSE
## 891
# Frequency of every Cabin value; the first (unlabelled) count is the blanks.
table(train[["Cabin"]])
##
## A10 A14 A16
## 687 1 1 1
## A19 A20 A23 A24
## 1 1 1 1
## A26 A31 A32 A34
## 1 1 1 1
## A36 A5 A6 A7
## 1 1 1 1
## B101 B102 B18 B19
## 1 1 2 1
## B20 B22 B28 B3
## 2 2 2 1
## B30 B35 B37 B38
## 1 2 1 1
## B39 B4 B41 B42
## 1 1 1 1
## B49 B5 B50 B51 B53 B55
## 2 2 1 2
## B57 B59 B63 B66 B58 B60 B69 B71
## 2 2 1 1
## B73 B77 B78 B79
## 1 2 1 1
## B80 B82 B84 B86 B94
## 1 1 1 1
## B96 B98 C101 C103 C104
## 4 1 1 1
## C106 C110 C111 C118
## 1 1 1 1
## C123 C124 C125 C126
## 2 2 2 2
## C128 C148 C2 C22 C26
## 1 1 2 3
## C23 C25 C27 C30 C32 C45
## 4 1 1 1
## C46 C47 C49 C50
## 1 1 1 1
## C52 C54 C62 C64 C65
## 2 1 1 2
## C68 C7 C70 C78
## 2 1 1 2
## C82 C83 C85 C86
## 1 2 1 1
## C87 C90 C91 C92
## 1 1 1 2
## C93 C95 C99 D
## 2 1 1 3
## D10 D12 D11 D15 D17
## 1 1 1 2
## D19 D20 D21 D26
## 1 2 1 2
## D28 D30 D33 D35
## 1 1 2 2
## D36 D37 D45 D46
## 2 1 1 1
## D47 D48 D49 D50
## 1 1 1 1
## D56 D6 D7 D9
## 1 1 1 1
## E10 E101 E12 E121
## 1 3 1 2
## E17 E24 E25 E31
## 1 2 2 1
## E33 E34 E36 E38
## 2 1 1 1
## E40 E44 E46 E49
## 1 2 1 1
## E50 E58 E63 E67
## 1 1 1 2
## E68 E77 E8 F E69
## 1 1 2 1
## F G63 F G73 F2 F33
## 1 2 3 3
## F38 F4 G6 T
## 1 2 4 1
# Work on a copy of the data without the Cabin column, then summarise Age
# (the summary reports the number of missing ages).
train2 <- train
train2[["Cabin"]] <- NULL
summary(train2[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
# Survived is coded 0/1, so its mean is the overall survival rate.
summary(train2[["Survived"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3838 1.0000 1.0000
# Counts per exact age (rows) and survival flag (columns).
table(train2[["Age"]], train2[["Survived"]])
##
## 0 1
## 0.42 0 1
## 0.67 0 1
## 0.75 0 2
## 0.83 0 2
## 0.92 0 1
## 1 2 5
## 2 7 3
## 3 1 5
## 4 3 7
## 5 0 4
## 6 1 2
## 7 2 1
## 8 2 2
## 9 6 2
## 10 2 0
## 11 3 1
## 12 0 1
## 13 0 2
## 14 3 3
## 14.5 1 0
## 15 1 4
## 16 11 6
## 17 7 6
## 18 17 9
## 19 16 9
## 20 12 3
## 20.5 1 0
## 21 19 5
## 22 16 11
## 23 10 5
## 23.5 1 0
## 24 15 15
## 24.5 1 0
## 25 17 6
## 26 12 6
## 27 7 11
## 28 18 7
## 28.5 2 0
## 29 12 8
## 30 15 10
## 30.5 2 0
## 31 9 8
## 32 9 9
## 32.5 1 1
## 33 9 6
## 34 9 6
## 34.5 1 0
## 35 7 11
## 36 11 11
## 36.5 1 0
## 37 5 1
## 38 6 5
## 39 9 5
## 40 7 6
## 40.5 2 0
## 41 4 2
## 42 7 6
## 43 4 1
## 44 6 3
## 45 7 5
## 45.5 2 0
## 46 3 0
## 47 8 1
## 48 3 6
## 49 2 4
## 50 5 5
## 51 5 2
## 52 3 3
## 53 0 1
## 54 5 3
## 55 1 1
## 55.5 1 0
## 56 2 2
## 57 2 0
## 58 2 3
## 59 2 0
## 60 2 2
## 61 3 0
## 62 2 2
## 63 0 2
## 64 2 0
## 65 3 0
## 66 1 0
## 70 2 0
## 70.5 1 0
## 71 2 0
## 74 1 0
## 80 0 1
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Bin Age into 10 equal-width intervals (arules::discretize) and tabulate
# survival within each bin.
train2$Age2 <- discretize(train2$Age, "interval", 10)
table(train2$Age2, train2$Survived)
##
## 0 1
## [ 0.42, 8.38) 18 36
## [ 8.38,16.34) 27 19
## [16.34,24.29) 114 63
## [24.29,32.25) 104 65
## [32.25,40.21) 66 52
## [40.21,48.17) 46 24
## [48.17,56.13) 24 21
## [56.13,64.08) 15 9
## [64.08,72.04) 9 0
## [72.04,80.00] 1 1
# Passengers whose Age is missing, and their survival counts.
train3 <- train2[is.na(train2[["Age"]]), ]
table(train3[["Survived"]])
##
## 0 1
## 125 52
# Survival counts for all passengers, for comparison with the subset above.
table(train[["Survived"]])
##
## 0 1
## 549 342
# Passenger-class counts among passengers with missing Age.
table(train3[["Pclass"]])
##
## 1 2 3
## 30 11 136
# Passenger-class counts for all passengers.
table(train[["Pclass"]])
##
## 1 2 3
## 216 184 491
# Keep only the passengers whose Age is known. The previous
# `train2[-is.na(train2$Age)]` turned the logical mask into 0 / -1 and, with
# no comma, indexed columns, so it dropped the first column and kept every
# row instead of removing the rows with missing Age.
train4 <- train2[!is.na(train2$Age), ]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'mtcars':
##
## mpg
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Overall median age, ignoring missing values.
median(train2$Age, na.rm = TRUE)
## [1] 28
library(data.table)
# Convert to a data.table so the grouped [i, j, by] syntax below can be used.
train2 <- data.table(train2)
colnames(train2)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Embarked" "Age2"
# Median age per passenger class. na.rm = TRUE has to be an argument of
# median(); placed outside it, data.table treated it as a second output
# column and every median was NA because Age contains missing values.
train2[, .(median(Age, na.rm = TRUE)), by = .(Pclass)]
## Pclass V1
## 1: 3 24
## 2: 1 37
## 3: 2 29
# Drop every row that contains an NA, then compute the median age per
# passenger class with Hmisc::summarize.
train5 <- na.omit(train2)
summarize(train5$Age, train5$Pclass, median)
## train5$Pclass train5$Age
## 1 1 37
## 2 2 29
## 3 3 24
# Overall median age among the complete rows.
median(train5$Age, na.rm = TRUE)
## [1] 28