In the previous topic we discussed US presidents inauguration speech statistics by estimating means of variables. This time we use additional data for the purpose of classification. Some interesting results will be obtained in the end.
Here are some notes about data origins. OIL - Annual Average Domestic Crude Oil Prices (in $/Barrel) Inflation Adjusted to August 2016 (http://inflationdata.com/Inflation/Inflation_Rate/Historical_Oil_Prices_Table.asp) Inflation rate (%) - http://www.multpl.com/inflation/table, Unemployment rate (%)- http://www.multpl.com/unemployment/table, GDP growth (%)-http://www.multpl.com/us-gdp-growth-rate/table/by-year AWH (Average Weekly Hours) - https://fred.stlouisfed.org/series/PRS84006021 SSN (Average sun spot number by year) - http://www.sidc.be/silso/datafiles
uspres<-read.table(file="uspres.txt")
uspres
## Year Party Old Height Inflation Unemployment GDP SSN AWH
## Trump 2017 Rep 70 184 2.07 4.7 2.94 39.9 0.4
## Obama2 2013 Dem 52 185 1.59 8.0 4.31 94.0 0.2
## Obama1 2009 Dem 48 185 0.03 7.8 0.11 4.8 0.8
## Bush-2 2005 Rep 59 182 2.97 5.3 6.52 45.8 0.1
## Bush-1 2001 Rep 55 182 3.73 4.2 2.19 170.4 1.5
## Clinton2 1997 Dem 51 188 3.04 5.3 6.05 28.9 0.3
## Clinton1 1993 Dem 47 188 3.26 7.3 5.00 76.1 0.6
## Bush 1989 Rep 65 188 4.67 5.4 6.48 211.1 0.5
## Reagan2 1985 Rep 74 185 3.53 7.5 7.37 20.6 0.2
## Reagan1 1981 Rep 70 185 11.83 7.5 9.69 198.9 0.5
## Carter 1977 Dem 53 177 5.22 7.5 11.88 39.3 0.9
## Ford 1974 Rep 61 183 9.39 5.1 8.38 49.2 2.1
## Nixon2 1973 Rep 60 182 3.65 4.9 11.04 54.1 0.3
## Nixon1 1969 Rep 56 182 4.40 3.4 7.28 149.4 0.8
## Johnson 1965 Dem 55 192 0.97 4.9 10.70 22.0 0.4
## Kennedy 1961 Dem 44 183 1.71 6.6 7.48 76.4 0.1
## Eisenhower2 1957 Rep 67 179 2.99 4.2 3.12 269.3 2.1
## Eisenhower1 1953 Rep 63 179 0.38 2.9 1.39 20.1 1.0
## Truman2 1949 Dem 65 175 1.27 4.3 -3.46 190.7 1.0
## Truman1 1945 Dem 61 175 2.30 3.0 76.35 55.3 0.5
## OIL
## Trump 34.13
## Obama2 94.25
## Obama1 59.93
## Bush-2 61.65
## Bush-1 31.30
## Clinton2 27.98
## Clinton1 27.94
## Bush 35.60
## Reagan2 60.27
## Reagan1 94.83
## Carter 57.20
## Ford 45.60
## Nixon2 25.56
## Nixon1 21.81
## Johnson 23.00
## Kennedy 22.96
## Eisenhower2 26.90
## Eisenhower1 26.23
## Truman2 28.02
## Truman1 19.80
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
uspres %>% group_by(Party) %>%
summarise(MUNEMPL=mean(Unemployment), MINFL=mean(Inflation), MGDPGR=mean(GDP), MOLD=mean(Old), MAWH=mean(AWH), MOIL=mean(OIL))
## # A tibble: 2 <U+00D7> 7
## Party MUNEMPL MINFL MGDPGR MOLD MAWH MOIL
## <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dem 6.077778 2.154444 13.157778 52.88889 0.5333333 40.12000
## 2 Rep 5.009091 4.510000 6.036364 63.63636 0.8636364 42.17091
library(psych)
cor(uspres[,c(3:10)])
## Old Height Inflation Unemployment GDP
## Old 1.0000000 -0.21368424 0.32117920 -0.2659879 0.04638720
## Height -0.2136842 1.00000000 0.06247463 0.4048385 -0.33619393
## Inflation 0.3211792 0.06247463 1.00000000 0.1967389 0.02926774
## Unemployment -0.2659879 0.40483852 0.19673893 1.0000000 -0.28686410
## GDP 0.0463872 -0.33619393 0.02926774 -0.2868641 1.00000000
## SSN 0.3051699 -0.18805315 0.31822197 -0.1777013 -0.15696216
## AWH 0.1540705 -0.34920910 0.25736450 -0.3365763 -0.14550895
## OIL 0.1377781 0.14515070 0.40579317 0.7133486 -0.18095438
## SSN AWH OIL
## Old 0.305169882 0.1540705 0.137778104
## Height -0.188053151 -0.3492091 0.145150699
## Inflation 0.318221965 0.2573645 0.405793167
## Unemployment -0.177701277 -0.3365763 0.713348628
## GDP -0.156962164 -0.1455090 -0.180954377
## SSN 1.000000000 0.4087765 -0.002851315
## AWH 0.408776516 1.0000000 -0.191756550
## OIL -0.002851315 -0.1917566 1.000000000
pairs.panels(uspres[,c(3:10)],lm = T)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(uspres, aes(Old,Unemployment,fill=Party)) + geom_jitter(height = 0 ) + geom_smooth(method = lm) +facet_wrap(~ Party)
ggplot(uspres, aes(Old,Inflation,fill=Party)) + geom_jitter(height = 0) + geom_smooth( method = lm) +facet_wrap(~ Party)
ggplot(uspres, aes(Old,GDP,fill=Party)) + geom_jitter(height = 0) + geom_smooth( method = lm) +facet_wrap(~ Party)
ggplot(uspres, aes(Old,AWH,fill=Party)) + geom_jitter(height = 0) + geom_smooth( method = lm) +facet_wrap(~ Party)
ggplot(uspres, aes(Old,OIL,fill=Party)) + geom_jitter(height = 0) + geom_smooth( method = lm) +facet_wrap(~ Party)
See https://en.wikipedia.org/wiki/Linear_regression
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
fit.lm<-lm(Unemployment~Party*Old,data=uspres)
summary(fit.lm)
##
## Call:
## lm(formula = Unemployment ~ Party * Old, data = uspres)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0129 -1.1557 0.3127 0.8144 1.7505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.2937 3.4568 4.714 0.000234 ***
## PartyRep -20.9069 5.3238 -3.927 0.001203 **
## Old -0.1932 0.0649 -2.976 0.008909 **
## PartyRep:Old 0.3444 0.0907 3.797 0.001583 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.229 on 16 degrees of freedom
## Multiple R-squared: 0.5334, Adjusted R-squared: 0.446
## F-statistic: 6.098 on 3 and 16 DF, p-value: 0.00573
anova(fit.lm)
## Analysis of Variance Table
##
## Response: Unemployment
## Df Sum Sq Mean Sq F value Pr(>F)
## Party 1 5.6534 5.6534 3.740 0.071013 .
## Old 1 0.2086 0.2086 0.138 0.715152
## Party:Old 1 21.7907 21.7907 14.416 0.001583 **
## Residuals 16 24.1853 1.5116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveragePlots(fit.lm)
par(mfrow=c(2,2))
plot(fit.lm)
fit.lm.2<-lm(Unemployment~OIL+Party*Old,data=uspres)
summary(fit.lm.2)
##
## Call:
## lm(formula = Unemployment ~ OIL + Party * Old, data = uspres)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3632 -0.4165 -0.1047 0.6023 0.8999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.974485 2.074163 6.255 1.54e-05 ***
## OIL 0.043681 0.007586 5.758 3.78e-05 ***
## PartyRep -14.766106 3.248621 -4.545 0.000387 ***
## Old -0.163535 0.037759 -4.331 0.000594 ***
## PartyRep:Old 0.241457 0.055248 4.370 0.000548 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7087 on 15 degrees of freedom
## Multiple R-squared: 0.8547, Adjusted R-squared: 0.8159
## F-statistic: 22.06 on 4 and 15 DF, p-value: 3.865e-06
anova(fit.lm,fit.lm.2)
## Analysis of Variance Table
##
## Model 1: Unemployment ~ Party * Old
## Model 2: Unemployment ~ OIL + Party * Old
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 16 24.1853
## 2 15 7.5328 1 16.652 33.16 3.778e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveragePlots(fit.lm.2)
plot(fit.lm.2)
See https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/Na%C3%AFve_Bayes.
library(e1071)
model <- naiveBayes(Party ~ Inflation + Unemployment + GDP + SSN + OIL, data = uspres[-1,]) #excludeing D.Trump
pred_party<-predict(model,uspres) #including D.Trump
table(pred_party,uspres$Party)#confusion matrix
##
## pred_party Dem Rep
## Dem 6 0
## Rep 3 11
prd<-predict(model, uspres, type = "raw")
prd<-as.data.frame(prd)
PartyNB<-ifelse(prd$Dem>0.5,"Dem","Rep")
cbind(prd,uspres[,c(1,2)])#posterior probabilities of classification combined with initial classes (Parties)
## Dem Rep Year Party
## Trump 4.409815e-01 5.590185e-01 2017 Rep
## Obama2 6.826034e-01 3.173966e-01 2013 Dem
## Obama1 8.826921e-01 1.173079e-01 2009 Dem
## Bush-2 2.738374e-01 7.261626e-01 2005 Rep
## Bush-1 5.680432e-02 9.431957e-01 2001 Rep
## Clinton2 2.954665e-01 7.045335e-01 1997 Dem
## Clinton1 4.591057e-01 5.408943e-01 1993 Dem
## Bush 4.272937e-03 9.957271e-01 1989 Rep
## Reagan2 4.489485e-01 5.510515e-01 1985 Rep
## Reagan1 2.478126e-09 1.000000e+00 1981 Rep
## Carter 4.073108e-01 5.926892e-01 1977 Dem
## Ford 1.533201e-05 9.999847e-01 1974 Rep
## Nixon2 4.279054e-01 5.720946e-01 1973 Rep
## Nixon1 2.521619e-02 9.747838e-01 1969 Rep
## Johnson 5.916684e-01 4.083316e-01 1965 Dem
## Kennedy 5.099330e-01 4.900670e-01 1961 Dem
## Eisenhower2 1.616249e-03 9.983838e-01 1957 Rep
## Eisenhower1 4.809695e-01 5.190305e-01 1953 Rep
## Truman2 7.736332e-01 2.263668e-01 1949 Dem
## Truman1 1.000000e+00 2.154283e-103 1945 Dem
sum(ifelse(PartyNB==uspres$Party,1,0))/length(uspres$Party)#classification accuracy
## [1] 0.85
See https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM.
model.svm<-svm(Party~Inflation+Unemployment+GDP+SSN+OIL,data = uspres[-1,],kernel="linear",type="C-classification",scale=T,probability=T) #excluding D.Trump
pred_svm<-predict(model.svm, uspres, probability=T) #including D.Trump
cbind(attr(pred_svm,which = "probabilities"),uspres[,c(1,2)])
## Dem Rep Year Party
## Trump 0.4594530 0.5405470 2017 Rep
## Obama2 0.7052414 0.2947586 2013 Dem
## Obama1 0.8153858 0.1846142 2009 Dem
## Bush-2 0.4142624 0.5857376 2005 Rep
## Bush-1 0.3035837 0.6964163 2001 Rep
## Clinton2 0.5000000 0.5000000 1997 Dem
## Clinton1 0.7051539 0.2948461 1993 Dem
## Bush 0.3856057 0.6143943 1989 Rep
## Reagan2 0.6406976 0.3593024 1985 Rep
## Reagan1 0.1242196 0.8757804 1981 Rep
## Carter 0.5697878 0.4302122 1977 Dem
## Ford 0.1291956 0.8708044 1974 Rep
## Nixon2 0.4510690 0.5489310 1973 Rep
## Nixon1 0.2393437 0.7606563 1969 Rep
## Johnson 0.6289383 0.3710617 1965 Dem
## Kennedy 0.7436137 0.2563863 1961 Dem
## Eisenhower2 0.3583046 0.6416954 1957 Rep
## Eisenhower1 0.3718701 0.6281299 1953 Rep
## Truman2 0.4423003 0.5576997 1949 Dem
## Truman1 0.7052413 0.2947587 1945 Dem
#posterior probabilities of classification combined with initial classes (Parties)
table(pred_svm,uspres$Party)#confusion matrix
##
## pred_svm Dem Rep
## Dem 8 1
## Rep 1 10
sum(ifelse(pred_svm==uspres$Party,1,0))/length(uspres$Party)#classification accuracy
## [1] 0.9
plot(model.svm,uspres,Inflation~Unemployment,col=c("blue","red"))
See https://en.wikipedia.org/wiki/Linear_discriminant_analysis
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(klaR)
fit.lda<-lda(Party ~ Inflation + Unemployment + GDP + SSN + OIL, data = uspres[-1,])#excluding D.Trump
fit.lda
## Call:
## lda(Party ~ Inflation + Unemployment + GDP + SSN + OIL, data = uspres[-1,
## ])
##
## Prior probabilities of groups:
## Dem Rep
## 0.4736842 0.5263158
##
## Group means:
## Inflation Unemployment GDP SSN OIL
## Dem 2.154444 6.077778 13.15778 65.27778 40.120
## Rep 4.754000 5.040000 6.34600 118.89000 42.975
##
## Coefficients of linear discriminants:
## LD1
## Inflation 0.3210571806
## Unemployment -0.8928047991
## GDP -0.0455257540
## SSN -0.0003368387
## OIL 0.0288014243
plot(fit.lda)
prd.lda<-predict(fit.lda,uspres)#including D.Trump
cbind(prd.lda,uspres[,c(1,2)])#posterior probabilities of classification combined with initial classes (Parties)
## class posterior.Dem posterior.Rep LD1 Year Party
## Trump Rep 0.261652658 0.73834734 0.38026975 2017 Rep
## Obama2 Dem 0.886724621 0.11327538 -1.06914516 2013 Dem
## Obama1 Dem 0.987681403 0.01231860 -2.15864410 2009 Dem
## Bush-2 Rep 0.135778031 0.86422197 0.76118398 2005 Rep
## Bush-1 Rep 0.050511311 0.94948869 1.26830590 2001 Rep
## Clinton2 Dem 0.528509753 0.47149025 -0.15899629 1997 Dem
## Clinton1 Dem 0.976123439 0.02387656 -1.84322211 1993 Dem
## Bush Rep 0.248151346 0.75184865 0.41356520 1989 Rep
## Reagan2 Dem 0.891772066 0.10822793 -1.09314907 1985 Rep
## Reagan1 Rep 0.004710700 0.99528930 2.40132466 1981 Rep
## Carter Dem 0.830761401 0.16923860 -0.85060285 1977 Dem
## Ford Rep 0.004221995 0.99577801 2.45284603 1974 Rep
## Nixon2 Rep 0.397814715 0.60218528 0.08860921 1973 Rep
## Nixon1 Rep 0.020736787 0.97926321 1.69968006 1969 Rep
## Johnson Dem 0.821143246 0.17885675 -0.81926440 1965 Dem
## Kennedy Dem 0.981771970 0.01822803 -1.97233340 1961 Dem
## Eisenhower2 Rep 0.119810572 0.88018943 0.82834502 1957 Rep
## Eisenhower1 Rep 0.047901460 0.95209854 1.29443483 1953 Rep
## Truman2 Rep 0.199493817 0.80050618 0.54513877 1949 Dem
## Truman1 Dem 0.973219412 0.02678059 -1.78807626 1945 Dem
table(prd.lda$class,uspres$Party)#confusion matrix
##
## Dem Rep
## Dem 8 1
## Rep 1 10
sum(ifelse(prd.lda$class==uspres$Party,1,0))/length(uspres$Party)#classification accuracy
## [1] 0.9
partimat(Party ~ Inflation + Unemployment + GDP + SSN + OIL,data=uspres,method="lda")#partition plot for classification
lm(Unemployment~OIL+Party*Old) that are significant and relevant with correlations above.Party~Inflation+Unemployment+GDP+SSN+OIL) give us 85% precision for classification result. Both models (NB, SVM) predict Republican president with 57-59% probability versus 43-41% for Democrat given Inflation, Unemployment, GDP, SSN and OIL data for current D.Trump case.