First step is to download the data to a local file system

# Show the current working directory; the data file read below is given as a
# relative file name.
getwd()

## [1] "D:/rlang"

# List the files in the current working directory.
list.files()

##  [1] "Assigment 2- script.R"          "assignment 4.pdf"              
##  [3] "baltcam1"                       "best.R"                        
##  [5] "chicagodata"                    "data"                          
##  [7] "decision_tree.png"              "hospital-data.csv"             
##  [9] "Hospital_Revised_Flatfiles.pdf" "midterm exam"                  
## [11] "midtermdata"                    "myfirstrprogram.R"             
## [13] "outcome-of-care-measures.csv"   "PollingData_Imputed.csv"       
## [15] "pollingdatanalaysis.html"       "pollingdatanalaysis.Rmd"       
## [17] "projectwork.Rmd"                "quality.csv"                   
## [19] "rankall.R"                      "rpackages"                     
## [21] "Rplot.pdf"                      "Rplot01.png"                   
## [23] "rsconnect"                      "rtest.R"                       
## [25] "run_analysis-dd.R"              "run_analysis1.R"               
## [27] "run_analysis2.R"                "secTidySet.txt"                
## [29] "specdata"                       "survey.csv"                    
## [31] "temp1.html"                     "temp1.Rmd"                     
## [33] "test.html"                      "test.Rmd"                      
## [35] "testindoc.html"                 "testindoc.Rmd"                 
## [37] "testing123.html"                "testing123.Rmd"                
## [39] "testing456.html"                "testing456.Rmd"                
## [41] "testing4567.html"               "testing4567.Rmd"

# Import the polling data: comma-separated, first row holds the column names.
# stringsAsFactors = TRUE is set explicitly because R >= 4.0.0 reads text
# columns as character by default, while the steps below treat the state
# column as a factor.
tryme <- read.table(
  "PollingData_Imputed.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)

Second step is to evaluate the data

# Preview the first rows of the imported data.
head(tryme)

##     State Year Rasmussen SurveyUSA DiffCount PropR Republican
## 1 Alabama 2004        11        18         5     1          1
## 2 Alabama 2008        21        25         5     1          1
## 3  Alaska 2004        19        21         1     1          1
## 4  Alaska 2008        16        18         6     1          1
## 5 Arizona 2004         5        15         8     1          1
## 6 Arizona 2008         5         3         9     1          1

# Swap the original column headers for short names, then preview the result.
colnames(tryme) <- c(
  "St", "Yr", "Ras", "SurvUSA",
  "DC", "PR", "Rep"
)
head(tryme)

##        St   Yr Ras SurvUSA DC PR Rep
## 1 Alabama 2004  11      18  5  1   1
## 2 Alabama 2008  21      25  5  1   1
## 3  Alaska 2004  19      21  1  1   1
## 4  Alaska 2008  16      18  6  1   1
## 5 Arizona 2004   5      15  8  1   1
## 6 Arizona 2008   5       3  9  1   1

Interpretation of Output

There is no favorable coefficient that has a direct relation. Additionally, the values of St have to be changed from state names to numeric codes.

# Inspect the storage class of the state column before recoding it.
class(tryme[["St"]])

## [1] "factor"

# Recode the state column as numeric codes (1, 2, 3, ... following the sorted
# state names). Going through factor() keeps this working when St was read as
# character (the default since R 4.0.0), where a bare as.numeric() would
# produce only NA values.
tryme$St <- as.numeric(factor(tryme$St))
head(tryme)

##   St   Yr Ras SurvUSA DC PR Rep
## 1  1 2004  11      18  5  1   1
## 2  1 2008  21      25  5  1   1
## 3  2 2004  19      21  1  1   1
## 4  2 2008  16      18  6  1   1
## 5  3 2004   5      15  8  1   1
## 6  3 2008   5       3  9  1   1

Install Additional Packages

#install.packages("caTools")
library(caTools)

Dividing the data into training data and test data

#set.seed(88)
#split = sample.split(tryme$Rep, SplitRatio = 0.75)
#head(split)
#pollingtrain = subset(tryme, split == TRUE)
#pollingtest = subset(tryme, split == FALSE)
#head(pollingtrain)
#head(pollingtest)

Creating Logistic Regression

#model = glm(Rep ~ St + Yr + Ras + SurvUSA +DC + PR, data = pollingtrain, family = binomial)
#model

Interpretation of Output

There is no favorable coefficient that has a direct relation.

Prediction on Test Data

#predict(model, pollingtest, type = "response")

##            5            6            8           10           18           20           28           34           37 
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 1.000000e+00 
##           38           39           42           44           45           46           49           54           56 
## 9.998519e-01 1.000000e+00 2.220446e-16 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 
##           62           64           66           68           71           72           75           76           86 
## 2.220446e-16 4.337903e-11 2.220446e-16 1.000000e+00 1.996228e-08 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 
##           99          102          107          114          115          130          136          143          145 
## 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 1.000000e+00

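Need to understand this: each number above is the predicted probability that Rep = 1 for the test row whose row number is printed above it. Almost every value is either 1 or 2.220446e-16 (the smallest probability R reports here, i.e. effectively 0), so the model assigns nearly every test row to one class with certainty. This pattern suggests the predictors separate the two classes almost perfectly; this should be confirmed by inspecting the fitted model.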

Create a Decision Tree Model

# install the new package "party"
#install.packages("party")
library(party)

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

Create an input data frame

# Keep rows 1 through 100 of the data as a separate input data frame.
input.data <- tryme[seq_len(100), ]

Create Chart Name

# Only the chart's file name is chosen here. The PNG device is opened once,
# just before plotting, so the single dev.off() below closes every device this
# script opens (previously png() was called twice and one device stayed open).
chart.file <- "decision_tree.png"

Create the Tree

library(party)
# Fit the tree before opening the device so that a modelling error cannot
# leave a graphics device open.
# NOTE(review): the tree is fitted on the full `tryme` data, not on
# `input.data` (the first 100 rows) created earlier; confirm which was intended.
output.tree <- ctree(Rep ~ St + Yr + Ras + SurvUSA + DC + PR, data = tryme)
png(file = chart.file)
plot(output.tree)
dev.off()

## png 
##   2

Mid Term Assignment for Business Analytics

AsaeedSh

January 6, 2017

First step is to download the data to a local file system

Second step is to evaluate the data

Interpretation of Output

Install Additional Packages

Dividing the data into training data and test data

Creating Logistic Regression

Interpretation of Output

Prediction on Test Data

Create a Decision Tree Model

Create an input data frame

Create Chart Name

Create the Tree