An R Markdown file is better than a Stata do file for adding notes to your syntax.
Vectors are a flexible object that can be used outside the context of a data frame.
Arrows assign information to an object (most languages use an equal sign). Note: assignment is based on the arrows dirction.
This: x <- 2 …is the same as: 2 -> x
# Assign a string
x <- "y"
x
## [1] "y"
# Assign answer as a "double" (float)
y <- 2+2
y
## [1] 4
# Assign the components of an object to another object
x <- y
x
## [1] 4
# Use the Concatnate function to assign multiple strings
x <- c("Apples", "Oranges", "Bananas", "Peaches", "Tomatoes", "Plums")
x
## [1] "Apples" "Oranges" "Bananas" "Peaches" "Tomatoes" "Plums"
x <- c(3, 7, 2, 4, 9, 7, 6, 8, 2)
x.min <- min(x)
x.max <- max(x)
?paste #ask R what the paste function is
paste("The min is", x.min, "and the max is", x.max)
## [1] "The min is 2 and the max is 9"
Loops can be used over vectors as well as variables. By interpreting strings instead of variables, loops can be used to compare variable names from different iterations of the same survey to see which variables are on both iterations.
Vectors do not need to be the same length in order to perform an operation on (unlike how variables need the same amount of rows in a spreadsheet-style dataframe)
fruit <- c("Apples", "Oranges", "Bananas", "Peaches", "Tomatoes", "Plums")
vegetables <- c("Carrots", "Radish", "Onions", "Tomatoes", "Squash", "Cabbage", "Seaweed")
ctr <- 1 #counter
match <- c() #initate vector (loop is appending to it)
for (i1 in fruit) {
for (i2 in vegetables) {
if (i1 == i2) {
match[[ctr]] <- i1
#the double brackets specify the position of component within the vector as opposed to the component itself
ctr <- ctr + 1
#count the loop iteration to define the component of the vector we will append to
}
}
}
### List number of fruits and vegetables and any matches
paste("Number of Fruits:", length(fruit))
## [1] "Number of Fruits: 6"
paste("Number of Vegetables:", length(vegetables))
## [1] "Number of Vegetables: 7"
paste("Matching in Fruit and Vegetables:", length(match), "which is", match)
## [1] "Matching in Fruit and Vegetables: 1 which is Tomatoes"
R has more tools to analyze a dataset and display the resulting output.
# Remember to use double backslashes or single forward slashes
setwd("C:\\Users\\Siebelm\\Documents\\4 FMG\\R Presentation")
# An CSV file is separated by commas
# Excel files are separated by semicolons
# Txt files are sometimes separated with a space (" ") or a tab ("/t"")
dataset <- read.csv("winequality-red.csv", sep=",")
colnames(dataset) #check variable names
## [1] "residual.sugar" "density" "alcohol" "quality"
head(dataset, 10) #check first 10 rows (six is default)
## residual.sugar density alcohol quality
## 1 1.9 0.9978 9.4 5
## 2 2.6 0.9968 9.8 5
## 3 2.3 0.9970 9.8 5
## 4 1.9 0.9980 9.8 6
## 5 1.9 0.9978 9.4 5
## 6 1.8 0.9978 9.4 5
## 7 1.6 0.9964 9.4 5
## 8 1.2 0.9946 10.0 7
## 9 2.0 0.9968 9.5 7
## 10 6.1 0.9978 10.5 5
# Rows and Columns
dim(dataset)
## [1] 1599 4
paste("Dataset has", dim(dataset)[1], "rows and", dim(dataset)[2], "columns.")
## [1] "Dataset has 1599 rows and 4 columns."
# The percentage of data missing
(sum(is.na(dataset))/(nrow(dataset)*ncol(dataset)))*100
## [1] 0
paste("The percentage of data missing",round((sum(is.na(dataset))/(nrow(dataset)*ncol(dataset)))*100, digits=2),"%")
## [1] "The percentage of data missing 0 %"
# Number of duplicate rows
nrow(dataset) - nrow(unique(dataset))
## [1] 264
paste("The number of duplicate rows are", nrow(dataset) - nrow(unique(dataset)))
## [1] "The number of duplicate rows are 264"
# R defines three types of variables: Numeric, Factor, and Character
names_numeric <- names(dataset)[which(sapply(dataset, is.numeric))]
paste("Number of Numeric variables are ", length(names_numeric)) # four
## [1] "Number of Numeric variables are 4"
names_fact <- names(dataset)[which(sapply(dataset, is.factor))]
paste("Number of Factor variables are ", length(names_fact)) # zero
## [1] "Number of Factor variables are 0"
names_char <- names(dataset)[which(sapply(dataset, is.character))]
paste("Number of Character variables are ", length(names_char)) # zero
## [1] "Number of Character variables are 0"
# R's dataframe hierarchy is: object "$" component within object "[" components within components "]"
# In other words: dataset "$" variable "[" observations "]"
# Or you can do: dataset "[" row "," column "]"
print("First five obs of quality:")
## [1] "First five obs of quality:"
dataset$quality[1:5]
## [1] 5 5 5 6 5
print("First five obs of residual sugar and alcohol percentage:")
## [1] "First five obs of residual sugar and alcohol percentage:"
dataset[1:5,1]
## [1] 1.9 2.6 2.3 1.9 1.9
dataset[1:5,3]
## [1] 9.4 9.8 9.8 9.8 9.4
R can simplify the output for regressions.
# Correlation
library(Hmisc)
rcorr(dataset$density, dataset$alcohol)[1]
## $r
## x y
## x 1.0000000 -0.4961796
## y -0.4961796 1.0000000
# Remove variable in a new dataset
dataset2 <- dataset #create copy of dataset
dataset2$density <- NULL #remove variable in new dataset (due to possible multicolinearity)
# OLS Regression
model.OLS <- lm(quality~., data = dataset2) # "." means all remaining varible
summary(model.OLS) #display saved results as in table output
##
## Call:
## lm(formula = quality ~ ., data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8412 -0.4080 -0.1698 0.5173 2.5877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.882059 0.176499 10.663 <2e-16 ***
## residual.sugar -0.003617 0.012618 -0.287 0.774
## alcohol 0.361043 0.016695 21.626 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7106 on 1596 degrees of freedom
## Multiple R-squared: 0.2268, Adjusted R-squared: 0.2258
## F-statistic: 234 on 2 and 1596 DF, p-value: < 2.2e-16
# Different displays of output
cat(paste("", "Output 1", "", sep="\n"))
##
## Output 1
summary(model.OLS)[4]
## $coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.882058583 0.17649950 10.6632517 1.082295e-25
## residual.sugar -0.003616748 0.01261840 -0.2866249 7.744368e-01
## alcohol 0.361043102 0.01669474 21.6261610 3.594183e-91
cat(paste("", "Output 2", "", sep="\n"))
##
## Output 2
round(summary(model.OLS)$coefficients ,2)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.88 0.18 10.66 0.00
## residual.sugar 0.00 0.01 -0.29 0.77
## alcohol 0.36 0.02 21.63 0.00
cat(paste("", "Output 3", "", sep="\n"))
##
## Output 3
round(coef(model.OLS),2)
## (Intercept) residual.sugar alcohol
## 1.88 0.00 0.36
Recoding variables is not necessarily as intuitive as in SAS.
# Recode as Binary
dataset2$quality.dum[dataset2$quality < 6] <- 0 #create variable quality.dum and assign it 0 if quality < 6
dataset2$quality.dum[dataset2$quality >= 6] <- 1 #assign 1 to newly created quality.dum if quality >= 6
dataset2$quality <- NULL #remove former variable
# Logistic Regression Odds Ratios
model.logit <- glm(quality.dum~., family = binomial(link = "logit"), data = dataset2)
summary(model.logit)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = dataset2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0972 -0.9310 0.3845 0.9640 2.0086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.69721 0.68659 -15.580 <2e-16 ***
## residual.sugar -0.03824 0.04216 -0.907 0.364
## alcohol 1.05853 0.06679 15.848 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2209.0 on 1598 degrees of freedom
## Residual deviance: 1864.2 on 1596 degrees of freedom
## AIC: 1870.2
##
## Number of Fisher Scoring iterations: 4
round(exp(coef(model.logit)),2)
## (Intercept) residual.sugar alcohol
## 0.00 0.96 2.88
R can use functions withing the code for predicted probabilities to define changes in the independent variables.
# Range of Alcohol Content
print("Alcohol Content")
## [1] "Alcohol Content"
summary(dataset2$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.40 9.50 10.20 10.42 11.10 14.90
# High Alcohol
highalcohol.pp <- data.frame(residual.sugar=mean(dataset2$residual.sugar), alcohol=quantile(dataset2$alcohol, c(.75)))
highalcohol.pp.pred <- predict(model.logit, highalcohol.pp, type='response')
# Low Alcohol
lowalcohol.pp <- data.frame(residual.sugar=mean(dataset2$residual.sugar), alcohol=quantile(dataset2$alcohol, c(.25)))
lowalcohol.pp.pred <- predict(model.logit, lowalcohol.pp, type='response')
# Predicted Probabilities
print("Probilities of Obtaining High Quality Wine")
## [1] "Probilities of Obtaining High Quality Wine"
paste("- Wine with high alcohol content: ", round(highalcohol.pp.pred,2))
## [1] "- Wine with high alcohol content: 0.72"
paste("- Wine with low alcohol content: ", round(lowalcohol.pp.pred,2))
## [1] "- Wine with low alcohol content: 0.32"
Here, I just want to demonstrate the benefits of being able to easily use multiple datasets at the same time.
### Train v Test ###
dataset2_random <- dataset2[sample(1:nrow(dataset2)), ]
train <- dataset2_random[1:799, ]
test <- dataset2_random[800:1599, ]
train.model <- glm(quality.dum~., family = binomial(link = "logit"), data = train)
summary(train.model)
##
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5099 -0.9154 0.2686 0.9737 2.0735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.11495 0.98938 -11.23 <2e-16 ***
## residual.sugar -0.06534 0.07340 -0.89 0.373
## alcohol 1.09604 0.09658 11.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1107.10 on 798 degrees of freedom
## Residual deviance: 926.47 on 796 degrees of freedom
## AIC: 932.47
##
## Number of Fisher Scoring iterations: 4
library(ROCR)
pred.model <- predict.glm(train.model, test, type='response')
newpred <- prediction(pred.model, test$quality.dum)
newpred.performance <- performance(newpred, measure = "tpr",x.measure = "fpr")
plot(newpred.performance)
abline(a=0, b= 1)
AUC <- performance(newpred, measure = "auc")
AUC
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.7534198
##
##
## Slot "alpha.values":
## list()
print("The train dataset correctly predicted 76% true postives of the test dataset")
## [1] "The train dataset correctly predicted 76% true postives of the test dataset"