The first step is to download the data to the local file system and read it into R.

# Show the working directory and its contents to confirm the CSV is present.
getwd()
## [1] "D:/rlang"
list.files()
##  [1] "Assigment 2- script.R"          "assignment 4.pdf"              
##  [3] "baltcam1"                       "best.R"                        
##  [5] "chicagodata"                    "data"                          
##  [7] "decision_tree.png"              "hospital-data.csv"             
##  [9] "Hospital_Revised_Flatfiles.pdf" "midterm exam"                  
## [11] "midtermdata"                    "myfirstrprogram.R"             
## [13] "outcome-of-care-measures.csv"   "PollingData_Imputed.csv"       
## [15] "pollingdatanalaysis.html"       "pollingdatanalaysis.Rmd"       
## [17] "projectwork.Rmd"                "quality.csv"                   
## [19] "rankall.R"                      "rpackages"                     
## [21] "Rplot.pdf"                      "Rplot01.png"                   
## [23] "rsconnect"                      "rtest.R"                       
## [25] "run_analysis-dd.R"              "run_analysis1.R"               
## [27] "run_analysis2.R"                "secTidySet.txt"                
## [29] "specdata"                       "survey.csv"                    
## [31] "temp1.html"                     "temp1.Rmd"                     
## [33] "test.html"                      "test.Rmd"                      
## [35] "testindoc.html"                 "testindoc.Rmd"                 
## [37] "testing123.html"                "testing123.Rmd"                
## [39] "testing456.html"                "testing456.Rmd"                
## [41] "testing4567.html"               "testing4567.Rmd"
# Read the comma-separated polling data, using the first row as column names.
# stringsAsFactors is set explicitly: the later steps treat the state column
# as a factor, which was only the read.table() default before R 4.0.0.
tryme <- read.table(
  "PollingData_Imputed.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)

The second step is to evaluate the data.

# Preview the first six rows under the original column names.
head(tryme, n = 6)
##     State Year Rasmussen SurveyUSA DiffCount PropR Republican
## 1 Alabama 2004        11        18         5     1          1
## 2 Alabama 2008        21        25         5     1          1
## 3  Alaska 2004        19        21         1     1          1
## 4  Alaska 2008        16        18         6     1          1
## 5 Arizona 2004         5        15         8     1          1
## 6 Arizona 2008         5         3         9     1          1
# Swap the column names for shorter abbreviations, then preview again.
colnames(tryme) <- c("St", "Yr", "Ras", "SurvUSA", "DC", "PR", "Rep")
head(tryme, n = 6)
##        St   Yr Ras SurvUSA DC PR Rep
## 1 Alabama 2004  11      18  5  1   1
## 2 Alabama 2008  21      25  5  1   1
## 3  Alaska 2004  19      21  1  1   1
## 4  Alaska 2008  16      18  6  1   1
## 5 Arizona 2004   5      15  8  1   1
## 6 Arizona 2008   5       3  9  1   1

Interpretation of Output

There is no favorable coefficient that has a direct relation. Additionally, the values of St have to be changed from state names to numeric codes.

# Check how the state column is stored before recoding it.
class(tryme$St)
## [1] "factor"
# Replace each state name with its integer level code. Going through
# as.factor() keeps this working when St was read as character (the
# read.table() default since R 4.0.0); a bare as.numeric() on character
# state names would return NA for every row.
# NOTE(review): the codes are just level indices and carry no numeric
# meaning -- confirm that treating St as a number is intended for modelling.
tryme$St <- as.numeric(as.factor(tryme$St))
head(tryme)
##   St   Yr Ras SurvUSA DC PR Rep
## 1  1 2004  11      18  5  1   1
## 2  1 2008  21      25  5  1   1
## 3  2 2004  19      21  1  1   1
## 4  2 2008  16      18  6  1   1
## 5  3 2004   5      15  8  1   1
## 6  3 2008   5       3  9  1   1

Install Additional Packages

#install.packages("caTools")
library(caTools)

Dividing the data into training data and test data

#set.seed(88)
#split = sample.split(tryme$Rep, SplitRatio = 0.75)
#head(split)
#pollingtrain = subset(tryme, split == TRUE)
#pollingtest = subset(tryme, split == FALSE)
#head(pollingtrain)
#head(pollingtest)

Creating Logistic Regression

#model = glm(Rep ~ St + Yr + Ras + SurvUSA +DC + PR, data = pollingtrain, family = binomial)
#model

Interpretation of Output

There is no favorable coefficient that has a direct relation.

Prediction on Test Data

#predict(model, pollingtest, type = "response")
##            5            6            8           10           18           20           28           34           37
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 1.000000e+00
##           38           39           42           44           45           46           49           54           56
## 9.998519e-01 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16
##           62           64           66           68           71           72           75           76           86
## 2.220446e-16 4.337903e-11 2.220446e-16 1.000000e+00 1.996228e-08 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16
##           99          102          107          114          115          130          136          143          145
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00

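Need to understand this: each value is the predicted probability of Rep = 1 for the test row whose row number is shown above it. Almost all of them are essentially 0 or 1 (2.220446e-16 is the machine epsilon for doubles), which may mean the predictors separate the two classes almost perfectly; this still needs to be investigated.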

Create a Decision Tree Model

# install the new package "party"
#install.packages("party")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich

Create an input data frame

# Keep the first 100 rows of the polling data as a separate data frame.
# NOTE(review): the ctree() call later in this document fits on the full
# tryme data and never references input.data -- confirm which was intended.
input.data <- tryme[seq_len(100), ]

Create the chart file name

# Open a PNG graphics device; subsequent plots are written to this file
# until the device is closed with dev.off().
png(filename = "decision_tree.png")

Create the Tree

# party is already attached and the PNG device for "decision_tree.png" is
# already open from the steps above. Calling png() a second time here would
# open another device on the same file, and the single dev.off() below would
# then leave one device open.

# Fit a conditional inference tree for Rep on all remaining columns.
output.tree <- ctree(Rep ~ St + Yr + Ras + SurvUSA + DC + PR, data = tryme)

# Draw the tree on the open PNG device, then close it so the file is written.
plot(output.tree)
dev.off()
## null device 
##           1