Introduction

This document looks at naive bayesian analysis and discriminant analysis. In particular, this document uses the weather.csv and looks at predicting rainfall and the likeliness of rainfall tomorrow.

Chapter 10 Naive Bayesian Analysis

head(weather)
## # A tibble: 6 x 25
##   Date  Date_1 Location MinTemp MaxTemp Rainfall Evaporation Sunshine
##   <chr>  <dbl> <chr>      <dbl>   <dbl>    <dbl>       <dbl>    <dbl>
## 1 11/1~  39387 Canberra     8      24.3      0           3.4      6.3
## 2 11/2~  39388 Canberra    14      26.9      3.6         4.4      9.7
## 3 11/3~  39389 Canberra    13.7    23.4      3.6         5.8      3.3
## 4 11/4~  39390 Canberra    13.3    15.5     39.8         7.2      9.1
## 5 11/5~  39391 Canberra     7.6    16.1      2.8         5.6     10.6
## 6 11/6~  39392 Canberra     6.2    16.9      0           5.8      8.2
## # ... with 17 more variables: WindGustDir <chr>, WindGustSpeed <dbl>,
## #   WindDir9am <chr>, WindDir3pm <chr>, WindSpeed9am <dbl>, WindSpeed3pm <dbl>,
## #   Humidity9am <dbl>, Humidity3pm <dbl>, Pressure9am <dbl>, Pressure3pm <dbl>,
## #   Cloud9am <dbl>, Cloud3pm <dbl>, Temp9am <dbl>, Temp3pm <dbl>,
## #   RainToday <chr>, RISK_MM <dbl>, RainTomorrow <chr>
set.seed(1)
weath=data.frame(weather)
weath$Datef=factor(floor(weath$Date_1/100))
weath$RainTomorrow<-recode(weath$RainTomorrow,"'Yes'=1;else=0")
response<-weath$RainTomorrow
hist(response)

mm=mean(response)
mm
## [1] 0.1803279
n=length(weath$WindGustDir)
n
## [1] 366
n1=floor(n*(0.6))
n1
## [1] 219
n2=n-n1
n2
## [1] 147
train=sample(1:n,n1)
tttt=cbind(weath$Datef[train],weath$Location[train],weath$dest[train],weath$MaxTemp[train],weath$MinTemp[train],weath$Rainfall[train],weath$RISK_MM[train],response[train])
head(tttt)  
##      [,1] [,2]       [,3]   [,4]   [,5] [,6]  [,7]
## [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"   "0" 
## [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"   "0" 
## [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"   "0" 
## [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"   "0" 
## [5,] "4"  "Canberra" "9.7"  "-2.3" "0"  "1.4" "1" 
## [6,] "3"  "Canberra" "19"   "0.4"  "0"  "0"   "0"
tdel=table(response[train])    
tdel 
## 
##   0   1 
## 180  39
tdel=tdel/sum(tdel)           
tdel  
## 
##         0         1 
## 0.8219178 0.1780822
tttrain0=tttt[tttt[,7]<0.5,] 
head(tttrain0)  
##      [,1] [,2]       [,3]   [,4]   [,5] [,6] [,7]
## [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"  "0" 
## [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"  "0" 
## [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"  "0" 
## [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"  "0" 
## [5,] "3"  "Canberra" "19"   "0.4"  "0"  "0"  "0" 
## [6,] "4"  "Canberra" "16.7" "0.1"  "0"  "0"  "0"
tttrain1=tttt[tttt[,7]>0.5,]  
head(tttrain1)
##      [,1] [,2]       [,3]   [,4]   [,5]  [,6]   [,7]
## [1,] "4"  "Canberra" "9.7"  "-2.3" "0"   "1.4"  "1" 
## [2,] "4"  "Canberra" "11"   "-1.1" "0.2" "19.2" "1" 
## [3,] "2"  "Canberra" "20.4" "15.1" "0"   "18.8" "1" 
## [4,] "2"  "Canberra" "25.8" "17.2" "0"   "4"    "1" 
## [5,] "3"  "Canberra" "19.8" "7.1"  "2"   "5.2"  "1" 
## [6,] "2"  "Canberra" "29.9" "10.1" "0"   "5.4"  "1"
tdel=table(response[train])    
tdel 
## 
##   0   1 
## 180  39
tdel=tdel/sum(tdel)            
tdel    
## 
##         0         1 
## 0.8219178 0.1780822
ts0=table(tttrain0[,1])
ts0
## 
##  1  2  3  4  5 
##  1 46 55 51 27
ts0=ts0/sum(ts0)
ts0 
## 
##           1           2           3           4           5 
## 0.005555556 0.255555556 0.305555556 0.283333333 0.150000000
ts1=table(tttrain1[,1])
ts1=ts1/sum(ts1)
ts1  
## 
##          1          2          3          4          5 
## 0.05128205 0.48717949 0.12820513 0.20512821 0.12820513
tc0=table(tttrain0[,2])
tc0
## 
## Canberra 
##      180
tc0=tc0/sum(tc0)
tc0 
## 
## Canberra 
##        1
tc1=table(tttrain1[,2])
tc1=tc1/sum(tc1)
tc1   
## 
## Canberra 
##        1
td0=table(tttrain0[,3])
head(td0)
## 
## 10.7 11.1 11.3 11.5 11.6 11.7 
##    1    1    1    1    3    1
td0=td0/sum(td0)
head(tc0)
## 
## Canberra 
##        1
td1=table(tttrain1[,3])
td1=td1/sum(td1)
head(td1)
## 
##       10.9         11       15.9       16.3       17.2       17.5 
## 0.02564103 0.02564103 0.02564103 0.05128205 0.02564103 0.02564103
to0=table(tttrain0[,4])
head(to0)
## 
## -0.1 -0.3 -0.5 -0.6 -0.9   -1 
##    2    1    1    1    3    1
to0=to0/sum(to0)
head(to0) 
## 
##        -0.1        -0.3        -0.5        -0.6        -0.9          -1 
## 0.011111111 0.005555556 0.005555556 0.005555556 0.016666667 0.005555556
to1=table(tttrain1[,4])
to1=to1/sum(to1)
head(to1)   
## 
##       -0.3       -1.1       -1.9       -2.3        0.5        1.8 
## 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
tc0=table(tttrain0[,5])
head(tc0)
## 
##   0 0.2 0.4 0.6 0.8   1 
## 140   8   2   3   1   2
tc0=tc0/sum(tc0)
head(tc0) 
## 
##           0         0.2         0.4         0.6         0.8           1 
## 0.777777778 0.044444444 0.011111111 0.016666667 0.005555556 0.011111111
tc1=table(tttrain1[,5])
tc1=tc1/sum(tc1)
head(tc1)
## 
##          0        0.2        0.4        0.6        1.6        1.8 
## 0.56410256 0.05128205 0.05128205 0.02564103 0.02564103 0.02564103
tw0=table(tttrain0[,6])
head(tw0)
## 
##   0 0.2 0.4 0.6 0.8   1 
## 161   6   2   5   3   3
tw0=tw0/sum(tw0)
head(tw0) 
## 
##          0        0.2        0.4        0.6        0.8          1 
## 0.89444444 0.03333333 0.01111111 0.02777778 0.01666667 0.01666667
tw1=table(tttrain1[,6])
tw1=tc1/sum(tw1)
head(tw1)
## 
##            0          0.2          0.4          0.6          1.6          1.8 
## 0.0144641683 0.0013149244 0.0013149244 0.0006574622 0.0006574622 0.0006574622
tdw0=table(tttrain0[,7])
tdw0
## 
##   0 
## 180
tdw0=tdw0/sum(tdw0)
tdw0 
## 
## 0 
## 1
tdw1=table(tttrain1[,7])
tdw1=tdw1/sum(tdw1)
tdw1
## 
## 1 
## 1
tt=cbind(weath$Datef[train],weath$Location[train],weath$dest[train],weath$MaxTemp[train],weath$MinTemp[train],weath$Rainfall[train],weath$RISK_MM[train],response[train])
head(tt,10)
##       [,1] [,2]       [,3]   [,4]   [,5] [,6]  [,7]
##  [1,] "5"  "Canberra" "23.3" "5.1"  "0"  "0"   "0" 
##  [2,] "3"  "Canberra" "20.7" "5.4"  "0"  "0"   "0" 
##  [3,] "3"  "Canberra" "27.4" "9.5"  "3"  "0"   "0" 
##  [4,] "4"  "Canberra" "14.2" "-0.9" "0"  "0"   "0" 
##  [5,] "4"  "Canberra" "9.7"  "-2.3" "0"  "1.4" "1" 
##  [6,] "3"  "Canberra" "19"   "0.4"  "0"  "0"   "0" 
##  [7,] "4"  "Canberra" "16.7" "0.1"  "0"  "0"   "0" 
##  [8,] "2"  "Canberra" "27.8" "10.3" "0"  "0"   "0" 
##  [9,] "4"  "Canberra" "12.8" "2.3"  "0"  "0"   "0" 
## [10,] "5"  "Canberra" "18.7" "3.2"  "0"  "0"   "0"
ttcol1<-tt[,1]
head(ttcol1)
## [1] "5" "3" "3" "4" "4" "3"
ttcol1[1]
## [1] "5"
ts0[ttcol1[1]]
##    5 
## 0.15
ts0  
## 
##           1           2           3           4           5 
## 0.005555556 0.255555556 0.305555556 0.283333333 0.150000000
p0=ts0[tt[,1]]*tc0[tt[,2]]*td0[tt[,3]]*to0[tt[,4]]*tw0[tt[,5]]*tdw0[tt[,6]]
head(p0)
## 
## 5 3 3 4 4 3 
## 
p1=ts1[tt[,1]]*tc1[tt[,2]]*td1[tt[,3]]*to1[tt[,4]]*tw1[tt[,5]]*tdw1[tt[,6]]
head(p1)
## 
## 5 3 3 4 4 3 
## 
gg=(p1*tdel[2])/(p1*tdel[2]+p0*tdel[1])  
head(gg,10)
## 
## 5 3 3 4 4 3 4 2 4 5 
## 
bb=cbind(gg,actual=response[-train])
## Warning in cbind(gg, actual = response[-train]): number of rows of result is not
## a multiple of vector length (arg 2)
head(bb,10)
##   gg actual
## 5 NA      1
## 3 NA      1
## 3 NA      1
## 4 NA      0
## 4 NA      0
## 3 NA      0
## 4 NA      0
## 2 NA      0
## 4 NA      0
## 5 NA      0
bb1=bb[order(gg,decreasing=TRUE),] 
head(bb1,10)
##   gg actual
## 5 NA      1
## 3 NA      1
## 3 NA      1
## 4 NA      0
## 4 NA      0
## 3 NA      0
## 4 NA      0
## 2 NA      0
## 4 NA      0
## 5 NA      0
xbar=mean(response[-train])
xbar
## [1] 0.1836735
n2 
## [1] 147
axis=dim(n2)                          
ax=dim(n2)                             
ay=dim(n2)                             
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]                         
for (i in 2:n2) {
  axis[i]=i                            
  ax[i]=xbar*i                           
  ay[i]=ay[i-1]+bb1[i,2]   
  if(i<11) print(c(axis[i], ax[i], ay[i]))
}
## [1] 2.0000000 0.3673469 2.0000000
## [1] 3.0000000 0.5510204 3.0000000
## [1] 4.0000000 0.7346939 3.0000000
## [1] 5.0000000 0.9183673 3.0000000
## [1] 6.000000 1.102041 3.000000
## [1] 7.000000 1.285714 3.000000
## [1] 8.000000 1.469388 3.000000
## [1] 9.000000 1.653061 3.000000
## [1] 10.000000  1.836735  3.000000
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
## Warning in cbind(bb1[, 1], bb1[, 2], ay, ax): number of rows of result is not a
## multiple of vector length (arg 3)
dim(aaa)
## [1] 219   4
head(aaa,10)
##        ay        ax
## 5 NA 1  1 0.1836735
## 3 NA 1  2 0.3673469
## 3 NA 1  3 0.5510204
## 4 NA 0  3 0.7346939
## 4 NA 0  3 0.9183673
## 3 NA 0  3 1.1020408
## 4 NA 0  3 1.2857143
## 2 NA 0  3 1.4693878
## 4 NA 0  3 1.6530612
## 5 NA 0  3 1.8367347
aaa[1:10,] 
##        ay        ax
## 5 NA 1  1 0.1836735
## 3 NA 1  2 0.3673469
## 3 NA 1  3 0.5510204
## 4 NA 0  3 0.7346939
## 4 NA 0  3 0.9183673
## 3 NA 0  3 1.1020408
## 4 NA 0  3 1.2857143
## 2 NA 0  3 1.4693878
## 4 NA 0  3 1.6530612
## 5 NA 0  3 1.8367347
plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

## Chapter 12 Discriminant Analysis

updatedweather <- read_csv("updatedweather.csv")
## Warning: Duplicated column names deduplicated: 'Date' => 'Date_1' [2]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_character(),
##   Location = col_character(),
##   WindGustDir = col_character(),
##   WindDir9am = col_character(),
##   WindDir3pm = col_character(),
##   RainToday = col_character(),
##   RainTomorrow = col_character()
## )
## See spec(...) for full column specifications.
upweath=data.frame(updatedweather)
head(upweath)
##        Date Date_1 Location MinTemp MaxTemp Rainfall Evaporation Sunshine
## 1 11/1/2007  39387 Canberra     8.0    24.3      0.0         3.4      6.3
## 2 11/2/2007  39388 Canberra    14.0    26.9      3.6         4.4      9.7
## 3 11/3/2007  39389 Canberra    13.7    23.4      3.6         5.8      3.3
## 4 11/4/2007  39390 Canberra    13.3    15.5     39.8         7.2      9.1
## 5 11/5/2007  39391 Canberra     7.6    16.1      2.8         5.6     10.6
## 6 11/6/2007  39392 Canberra     6.2    16.9      0.0         5.8      8.2
##   WindGustDir WindGustSpeed WindDir9am WindDir3pm WindSpeed9am WindSpeed3pm
## 1          NW            30         SW         NW            6           20
## 2         ENE            39          E          W            4           17
## 3          NW            85          N        NNE            6            6
## 4          NW            54        WNW          W           30           24
## 5         SSE            50        SSE        ESE           20           28
## 6          SE            44         SE          E           20           24
##   Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am
## 1          68          29      1019.7      1015.0        7        7    14.4
## 2          80          36      1012.4      1008.4        5        3    17.5
## 3          82          69      1009.5      1007.2        8        7    15.4
## 4          62          56      1005.5      1007.0        2        7    13.5
## 5          68          49      1018.3      1018.5        7        7    11.1
## 6          70          57      1023.8      1021.7        7        5    10.9
##   Temp3pm RainToday RISK_MM RainTomorrow
## 1    23.6        No     3.6          Yes
## 2    25.7       Yes     3.6          Yes
## 3    20.2       Yes    39.8          Yes
## 4    14.1       Yes     2.8          Yes
## 5    15.4       Yes     0.0           No
## 6    14.8        No     0.2           No
upweath1=updatedweather[,c("MaxTemp","MinTemp","Rainfall","RainTomorrow")]
head(upweath1)
## # A tibble: 6 x 4
##   MaxTemp MinTemp Rainfall RainTomorrow
##     <dbl>   <dbl>    <dbl> <chr>       
## 1    24.3     8        0   Yes         
## 2    26.9    14        3.6 Yes         
## 3    23.4    13.7      3.6 Yes         
## 4    15.5    13.3     39.8 Yes         
## 5    16.1     7.6      2.8 No          
## 6    16.9     6.2      0   No
summary(upweath1) 
##     MaxTemp         MinTemp          Rainfall      RainTomorrow      
##  Min.   : 7.60   Min.   :-5.300   Min.   : 0.000   Length:366        
##  1st Qu.:15.03   1st Qu.: 2.300   1st Qu.: 0.000   Class :character  
##  Median :19.65   Median : 7.450   Median : 0.000   Mode  :character  
##  Mean   :20.55   Mean   : 7.266   Mean   : 1.428                     
##  3rd Qu.:25.50   3rd Qu.:12.500   3rd Qu.: 0.200                     
##  Max.   :35.80   Max.   :20.900   Max.   :39.800
m1=lda(RainTomorrow~.,upweath1)
m1
## Call:
## lda(RainTomorrow ~ ., data = upweath1)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8196721 0.1803279 
## 
## Group means:
##      MaxTemp   MinTemp Rainfall
## No  20.39600  6.607333 1.164000
## Yes 21.25152 10.257576 2.630303
## 
## Coefficients of linear discriminants:
##                  LD1
## MaxTemp  -0.14390495
## MinTemp   0.25130654
## Rainfall  0.02028501
predict(m1,newdata=data.frame(MaxTemp=16.9,MinTemp=6.2,Rainfall=0))
## $class
## [1] No
## Levels: No Yes
## 
## $posterior
##          No       Yes
## 1 0.8238795 0.1761205
## 
## $x
##         LD1
## 1 0.2285313
m2=qda(RainTomorrow~.,upweath1)
m2
## Call:
## qda(RainTomorrow ~ ., data = upweath1)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8196721 0.1803279 
## 
## Group means:
##      MaxTemp   MinTemp Rainfall
## No  20.39600  6.607333 1.164000
## Yes 21.25152 10.257576 2.630303
predict(m2,newdata=data.frame(MaxTemp=16.9,MinTemp=6.2,Rainfall=0))
## $class
## [1] No
## Levels: No Yes
## 
## $posterior
##         No      Yes
## 1 0.887376 0.112624
n=366
nt=330
neval=n-nt
rep=100
errlin=dim(rep)
for (k in 1:rep) {
  train=sample(1:n,nt)
  m1=lda(RainTomorrow~.,upweath1[train,])
  predict(m1,upweath1[-train,])$class
  tablin=table(upweath1$RainTomorrow[-train],predict(m1,upweath1[-train,])$class)
  errlin[k]=(neval-sum(diag(tablin)))/neval
}
merrlin=mean(errlin)
merrlin
## [1] 0.1808333

Conclusion

Since the dates were in character form, they were simplified into a numerical form rather date form. This was done directly in the csv prior to uploading it into R. Rainfall was categorized as either having rained or have not rained. Since the data dealt with bayesian analysis, the variables of destination, maximum temperature, minimum temperature,rainfall, and Risk MM were conditionally looked at find the probability of it raining tomorrow. It was determined that the probability of it raining tomorrow was 18.367&. For discriminant analysis, there was a total of 366 cases for this data set. The linear discriminant analysis looked a 330 cases out of 366, and classified the sample of 36 days. The misclassification rate was found to be 18.083%.