Chapter 10 & 12 Homework

Chapter 10 Naive Bayesian Analysis

head(weather)

## # A tibble: 6 x 25
##   Date  Date_1 Location MinTemp MaxTemp Rainfall Evaporation Sunshine
##   <chr>  <dbl> <chr>      <dbl>   <dbl>    <dbl>       <dbl>    <dbl>
## 1 11/1~  39387 Canberra     8      24.3      0           3.4      6.3
## 2 11/2~  39388 Canberra    14      26.9      3.6         4.4      9.7
## 3 11/3~  39389 Canberra    13.7    23.4      3.6         5.8      3.3
## 4 11/4~  39390 Canberra    13.3    15.5     39.8         7.2      9.1
## 5 11/5~  39391 Canberra     7.6    16.1      2.8         5.6     10.6
## 6 11/6~  39392 Canberra     6.2    16.9      0           5.8      8.2
## # ... with 17 more variables: WindGustDir <chr>, WindGustSpeed <dbl>,
## #   WindDir9am <chr>, WindDir3pm <chr>, WindSpeed9am <dbl>, WindSpeed3pm <dbl>,
## #   Humidity9am <dbl>, Humidity3pm <dbl>, Pressure9am <dbl>, Pressure3pm <dbl>,
## #   Cloud9am <dbl>, Cloud3pm <dbl>, Temp9am <dbl>, Temp3pm <dbl>,
## #   RainToday <chr>, RISK_MM <dbl>, RainTomorrow <chr>

set.seed(1)
weath=data.frame(weather)
weath$Datef=factor(floor(weath$Date_1/100))
weath$RainTomorrow<-recode(weath$RainTomorrow,"'Yes'=1;else=0")

response<-weath$RainTomorrow
hist(response)

mm=mean(response)
mm

## [1] 0.1803279

n=length(weath$WindGustDir)
n

## [1] 366

n1=floor(n*(0.6))
n1

## [1] 219

n2=n-n1
n2

## [1] 147

train=sample(1:n,n1)

tttt=cbind(weath$Datef[train],weath$Location[train],weath$dest[train],weath$MaxTemp[train],weath$MinTemp[train],weath$Rainfall[train],weath$RISK_MM[train],response[train])
head(tttt)

##      [,1] [,2]       [,3]   [,4]   [,5] [,6]  [,7]
## [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"   "0" 
## [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"   "0" 
## [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"   "0" 
## [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"   "0" 
## [5,] "4"  "Canberra" "9.7"  "-2.3" "0"  "1.4" "1" 
## [6,] "3"  "Canberra" "19"   "0.4"  "0"  "0"   "0"

tdel=table(response[train])    
tdel

## 
##   0   1 
## 180  39

tdel=tdel/sum(tdel)           
tdel

## 
##         0         1 
## 0.8219178 0.1780822

tttrain0=tttt[tttt[,7]<0.5,] 
head(tttrain0)

##      [,1] [,2]       [,3]   [,4]   [,5] [,6] [,7]
## [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"  "0" 
## [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"  "0" 
## [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"  "0" 
## [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"  "0" 
## [5,] "3"  "Canberra" "19"   "0.4"  "0"  "0"  "0" 
## [6,] "4"  "Canberra" "16.7" "0.1"  "0"  "0"  "0"

tttrain1=tttt[tttt[,7]>0.5,]  
head(tttrain1)

##      [,1] [,2]       [,3]   [,4]   [,5]  [,6]   [,7]
## [1,] "4"  "Canberra" "9.7"  "-2.3" "0"   "1.4"  "1" 
## [2,] "4"  "Canberra" "11"   "-1.1" "0.2" "19.2" "1" 
## [3,] "2"  "Canberra" "20.4" "15.1" "0"   "18.8" "1" 
## [4,] "2"  "Canberra" "25.8" "17.2" "0"   "4"    "1" 
## [5,] "3"  "Canberra" "19.8" "7.1"  "2"   "5.2"  "1" 
## [6,] "2"  "Canberra" "29.9" "10.1" "0"   "5.4"  "1"

tdel=table(response[train])    
tdel

## 
##   0   1 
## 180  39

tdel=tdel/sum(tdel)            
tdel

## 
##         0         1 
## 0.8219178 0.1780822

ts0=table(tttrain0[,1])
ts0

## 
##  1  2  3  4  5 
##  1 46 55 51 27

ts0=ts0/sum(ts0)
ts0

## 
##           1           2           3           4           5 
## 0.005555556 0.255555556 0.305555556 0.283333333 0.150000000

ts1=table(tttrain1[,1])
ts1=ts1/sum(ts1)
ts1

## 
##          1          2          3          4          5 
## 0.05128205 0.48717949 0.12820513 0.20512821 0.12820513

tc0=table(tttrain0[,2])
tc0

## 
## Canberra 
##      180

tc0=tc0/sum(tc0)
tc0

## 
## Canberra 
##        1

tc1=table(tttrain1[,2])
tc1=tc1/sum(tc1)
tc1

## 
## Canberra 
##        1

td0=table(tttrain0[,3])
head(td0)

## 
## 10.7 11.1 11.3 11.5 11.6 11.7 
##    1    1    1    1    3    1

td0=td0/sum(td0)
head(tc0)

## 
## Canberra 
##        1

td1=table(tttrain1[,3])
td1=td1/sum(td1)
head(td1)

## 
##       10.9         11       15.9       16.3       17.2       17.5 
## 0.02564103 0.02564103 0.02564103 0.05128205 0.02564103 0.02564103

to0=table(tttrain0[,4])
head(to0)

## 
## -0.1 -0.3 -0.5 -0.6 -0.9   -1 
##    2    1    1    1    3    1

to0=to0/sum(to0)
head(to0)

## 
##        -0.1        -0.3        -0.5        -0.6        -0.9          -1 
## 0.011111111 0.005555556 0.005555556 0.005555556 0.016666667 0.005555556

to1=table(tttrain1[,4])
to1=to1/sum(to1)
head(to1)

## 
##       -0.3       -1.1       -1.9       -2.3        0.5        1.8 
## 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103

tc0=table(tttrain0[,5])
head(tc0)

## 
##   0 0.2 0.4 0.6 0.8   1 
## 140   8   2   3   1   2

tc0=tc0/sum(tc0)
head(tc0)

## 
##           0         0.2         0.4         0.6         0.8           1 
## 0.777777778 0.044444444 0.011111111 0.016666667 0.005555556 0.011111111

tc1=table(tttrain1[,5])
tc1=tc1/sum(tc1)
head(tc1)

## 
##          0        0.2        0.4        0.6        1.6        1.8 
## 0.56410256 0.05128205 0.05128205 0.02564103 0.02564103 0.02564103

tw0=table(tttrain0[,6])
head(tw0)

## 
##   0 0.2 0.4 0.6 0.8   1 
## 161   6   2   5   3   3

tw0=tw0/sum(tw0)
head(tw0)

## 
##          0        0.2        0.4        0.6        0.8          1 
## 0.89444444 0.03333333 0.01111111 0.02777778 0.01666667 0.01666667

tw1=table(tttrain1[,6])
tw1=tc1/sum(tw1)
head(tw1)

## 
##            0          0.2          0.4          0.6          1.6          1.8 
## 0.0144641683 0.0013149244 0.0013149244 0.0006574622 0.0006574622 0.0006574622

tdw0=table(tttrain0[,7])
tdw0

## 
##   0 
## 180

tdw0=tdw0/sum(tdw0)
tdw0

## 
## 0 
## 1

tdw1=table(tttrain1[,7])
tdw1=tdw1/sum(tdw1)
tdw1

## 
## 1 
## 1

tt=cbind(weath$Datef[train],weath$Location[train],weath$dest[train],weath$MaxTemp[train],weath$MinTemp[train],weath$Rainfall[train],weath$RISK_MM[train],response[train])
head(tt,10)

##       [,1] [,2]       [,3]   [,4]   [,5] [,6]  [,7]
##  [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"   "0" 
##  [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"   "0" 
##  [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"   "0" 
##  [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"   "0" 
##  [5,] "4"  "Canberra" "9.7"  "-2.3" "0"  "1.4" "1" 
##  [6,] "3"  "Canberra" "19"   "0.4"  "0"  "0"   "0" 
##  [7,] "4"  "Canberra" "16.7" "0.1"  "0"  "0"   "0" 
##  [8,] "2"  "Canberra" "27.8" "10.3" "0"  "0"   "0" 
##  [9,] "4"  "Canberra" "12.8" "2.3"  "0"  "0"   "0" 
## [10,] "5"  "Canberra" "18.7" "3.2"  "0"  "0"   "0"

ttcol1<-tt[,1]
head(ttcol1)

## [1] "5" "3" "3" "4" "4" "3"

ttcol1[1]

## [1] "5"

ts0[ttcol1[1]]

##    5 
## 0.15

ts0

## 
##           1           2           3           4           5 
## 0.005555556 0.255555556 0.305555556 0.283333333 0.150000000

p0=ts0[tt[,1]]*tc0[tt[,2]]*td0[tt[,3]]*to0[tt[,4]]*tw0[tt[,5]]*tdw0[tt[,6]]
head(p0)

## 
## 5 3 3 4 4 3 
##

p1=ts1[tt[,1]]*tc1[tt[,2]]*td1[tt[,3]]*to1[tt[,4]]*tw1[tt[,5]]*tdw1[tt[,6]]
head(p1)

## 
## 5 3 3 4 4 3 
##

gg=(p1*tdel[2])/(p1*tdel[2]+p0*tdel[1])  
head(gg,10)

## 
## 5 3 3 4 4 3 4 2 4 5 
##

bb=cbind(gg,actual=response[-train])

## Warning in cbind(gg, actual = response[-train]): number of rows of result is not
## a multiple of vector length (arg 2)

head(bb,10)

##   gg actual
## 5 NA      1
## 3 NA      1
## 3 NA      1
## 4 NA      0
## 4 NA      0
## 3 NA      0
## 4 NA      0
## 2 NA      0
## 4 NA      0
## 5 NA      0

bb1=bb[order(gg,decreasing=TRUE),] 
head(bb1,10)

##   gg actual
## 5 NA      1
## 3 NA      1
## 3 NA      1
## 4 NA      0
## 4 NA      0
## 3 NA      0
## 4 NA      0
## 2 NA      0
## 4 NA      0
## 5 NA      0

xbar=mean(response[-train])
xbar

## [1] 0.1836735

n2

## [1] 147

axis=dim(n2)                          
ax=dim(n2)                             
ay=dim(n2)                             
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]                         
for (i in 2:n2) {
  axis[i]=i                            
  ax[i]=xbar*i                           
  ay[i]=ay[i-1]+bb1[i,2]   
  if(i<11) print(c(axis[i], ax[i], ay[i]))
}

## [1] 2.0000000 0.3673469 2.0000000
## [1] 3.0000000 0.5510204 3.0000000
## [1] 4.0000000 0.7346939 3.0000000
## [1] 5.0000000 0.9183673 3.0000000
## [1] 6.000000 1.102041 3.000000
## [1] 7.000000 1.285714 3.000000
## [1] 8.000000 1.469388 3.000000
## [1] 9.000000 1.653061 3.000000
## [1] 10.000000  1.836735  3.000000

aaa=cbind(bb1[,1],bb1[,2],ay,ax)

## Warning in cbind(bb1[, 1], bb1[, 2], ay, ax): number of rows of result is not a
## multiple of vector length (arg 3)

dim(aaa)

## [1] 219   4

head(aaa,10)

##        ay        ax
## 5 NA 1  1 0.1836735
## 3 NA 1  2 0.3673469
## 3 NA 1  3 0.5510204
## 4 NA 0  3 0.7346939
## 4 NA 0  3 0.9183673
## 3 NA 0  3 1.1020408
## 4 NA 0  3 1.2857143
## 2 NA 0  3 1.4693878
## 4 NA 0  3 1.6530612
## 5 NA 0  3 1.8367347

aaa[1:10,]

##        ay        ax
## 5 NA 1  1 0.1836735
## 3 NA 1  2 0.3673469
## 3 NA 1  3 0.5510204
## 4 NA 0  3 0.7346939
## 4 NA 0  3 0.9183673
## 3 NA 0  3 1.1020408
## 4 NA 0  3 1.2857143
## 2 NA 0  3 1.4693878
## 4 NA 0  3 1.6530612
## 5 NA 0  3 1.8367347

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

## Chapter 12 Discriminant Analysis

updatedweather <- read_csv("updatedweather.csv")

## Warning: Duplicated column names deduplicated: 'Date' => 'Date_1' [2]

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_character(),
##   Location = col_character(),
##   WindGustDir = col_character(),
##   WindDir9am = col_character(),
##   WindDir3pm = col_character(),
##   RainToday = col_character(),
##   RainTomorrow = col_character()
## )

## See spec(...) for full column specifications.

upweath=data.frame(updatedweather)
head(upweath)

##        Date Date_1 Location MinTemp MaxTemp Rainfall Evaporation Sunshine
## 1 11/1/2007  39387 Canberra     8.0    24.3      0.0         3.4      6.3
## 2 11/2/2007  39388 Canberra    14.0    26.9      3.6         4.4      9.7
## 3 11/3/2007  39389 Canberra    13.7    23.4      3.6         5.8      3.3
## 4 11/4/2007  39390 Canberra    13.3    15.5     39.8         7.2      9.1
## 5 11/5/2007  39391 Canberra     7.6    16.1      2.8         5.6     10.6
## 6 11/6/2007  39392 Canberra     6.2    16.9      0.0         5.8      8.2
##   WindGustDir WindGustSpeed WindDir9am WindDir3pm WindSpeed9am WindSpeed3pm
## 1          NW            30         SW         NW            6           20
## 2         ENE            39          E          W            4           17
## 3          NW            85          N        NNE            6            6
## 4          NW            54        WNW          W           30           24
## 5         SSE            50        SSE        ESE           20           28
## 6          SE            44         SE          E           20           24
##   Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am
## 1          68          29      1019.7      1015.0        7        7    14.4
## 2          80          36      1012.4      1008.4        5        3    17.5
## 3          82          69      1009.5      1007.2        8        7    15.4
## 4          62          56      1005.5      1007.0        2        7    13.5
## 5          68          49      1018.3      1018.5        7        7    11.1
## 6          70          57      1023.8      1021.7        7        5    10.9
##   Temp3pm RainToday RISK_MM RainTomorrow
## 1    23.6        No     3.6          Yes
## 2    25.7       Yes     3.6          Yes
## 3    20.2       Yes    39.8          Yes
## 4    14.1       Yes     2.8          Yes
## 5    15.4       Yes     0.0           No
## 6    14.8        No     0.2           No

upweath1=updatedweather[,c("MaxTemp","MinTemp","Rainfall","RainTomorrow")]
head(upweath1)

## # A tibble: 6 x 4
##   MaxTemp MinTemp Rainfall RainTomorrow
##     <dbl>   <dbl>    <dbl> <chr>       
## 1    24.3     8        0   Yes         
## 2    26.9    14        3.6 Yes         
## 3    23.4    13.7      3.6 Yes         
## 4    15.5    13.3     39.8 Yes         
## 5    16.1     7.6      2.8 No          
## 6    16.9     6.2      0   No

summary(upweath1)

##     MaxTemp         MinTemp          Rainfall      RainTomorrow      
##  Min.   : 7.60   Min.   :-5.300   Min.   : 0.000   Length:366        
##  1st Qu.:15.03   1st Qu.: 2.300   1st Qu.: 0.000   Class :character  
##  Median :19.65   Median : 7.450   Median : 0.000   Mode  :character  
##  Mean   :20.55   Mean   : 7.266   Mean   : 1.428                     
##  3rd Qu.:25.50   3rd Qu.:12.500   3rd Qu.: 0.200                     
##  Max.   :35.80   Max.   :20.900   Max.   :39.800

m1=lda(RainTomorrow~.,upweath1)
m1

## Call:
## lda(RainTomorrow ~ ., data = upweath1)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8196721 0.1803279 
## 
## Group means:
##      MaxTemp   MinTemp Rainfall
## No  20.39600  6.607333 1.164000
## Yes 21.25152 10.257576 2.630303
## 
## Coefficients of linear discriminants:
##                  LD1
## MaxTemp  -0.14390495
## MinTemp   0.25130654
## Rainfall  0.02028501

predict(m1,newdata=data.frame(MaxTemp=16.9,MinTemp=6.2,Rainfall=0))

## $class
## [1] No
## Levels: No Yes
## 
## $posterior
##          No       Yes
## 1 0.8238795 0.1761205
## 
## $x
##         LD1
## 1 0.2285313

m2=qda(RainTomorrow~.,upweath1)
m2

## Call:
## qda(RainTomorrow ~ ., data = upweath1)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8196721 0.1803279 
## 
## Group means:
##      MaxTemp   MinTemp Rainfall
## No  20.39600  6.607333 1.164000
## Yes 21.25152 10.257576 2.630303

predict(m2,newdata=data.frame(MaxTemp=16.9,MinTemp=6.2,Rainfall=0))

## $class
## [1] No
## Levels: No Yes
## 
## $posterior
##         No      Yes
## 1 0.887376 0.112624

n=366
nt=330
neval=n-nt

rep=100
errlin=dim(rep)

for (k in 1:rep) {
  train=sample(1:n,nt)
  m1=lda(RainTomorrow~.,upweath1[train,])
  predict(m1,upweath1[-train,])$class
  tablin=table(upweath1$RainTomorrow[-train],predict(m1,upweath1[-train,])$class)
  errlin[k]=(neval-sum(diag(tablin)))/neval
}
merrlin=mean(errlin)
merrlin

## [1] 0.1808333

Chapter 10 & 12 Homework

Nickolas Parker

10/16/2020

Introduction

Chapter 10 Naive Bayesian Analysis

Conclusion