R Presentation

Notes

An R Markdown file is better than a Stata do file for adding notes to your syntax.

Vectors

Vectors are a flexible object that can be used outside the context of a data frame.

Arrows assign information to an object (most languages use an equal sign). Note: assignment is based on the arrows dirction.

This: x <- 2 …is the same as: 2 -> x

# Assign a string
x <- "y"
x

## [1] "y"

# Assign answer as a "double" (float)
y <- 2+2
y

## [1] 4

# Assign the components of an object to another object
x <- y
x

## [1] 4

# Use the Concatnate function to assign multiple strings
x <- c("Apples", "Oranges", "Bananas", "Peaches", "Tomatoes", "Plums")
x

## [1] "Apples"   "Oranges"  "Bananas"  "Peaches"  "Tomatoes" "Plums"

x <- c(3, 7, 2, 4, 9, 7, 6, 8, 2)
x.min <- min(x)
x.max <- max(x)
?paste #ask R what the paste function is
paste("The min is", x.min, "and the max is", x.max)

## [1] "The min is 2 and the max is 9"

Loop over Characters

Loops can be used over vectors as well as variables. By interpreting strings instead of variables, loops can be used to compare variable names from different iterations of the same survey to see which variables are on both iterations.

Vectors do not need to be the same length in order to perform an operation on (unlike how variables need the same amount of rows in a spreadsheet-style dataframe)

fruit <- c("Apples", "Oranges", "Bananas", "Peaches", "Tomatoes", "Plums")
vegetables <- c("Carrots", "Radish", "Onions", "Tomatoes", "Squash", "Cabbage", "Seaweed")

ctr <- 1 #counter
match <- c() #initate vector (loop is appending to it)
for (i1 in fruit) {
  for (i2 in vegetables) {
    if (i1 == i2) {
      match[[ctr]] <- i1 
      #the double brackets specify the position of component within the vector as opposed to the component itself
      ctr <- ctr + 1
      #count the loop iteration to define the component of the vector we will append to
    }
  }
}

### List number of fruits and vegetables and any matches
paste("Number of Fruits:", length(fruit))

## [1] "Number of Fruits: 6"

paste("Number of Vegetables:", length(vegetables))

## [1] "Number of Vegetables: 7"

paste("Matching in Fruit and Vegetables:", length(match), "which is", match)

## [1] "Matching in Fruit and Vegetables: 1 which is Tomatoes"

Dataset Cleaning

R has more tools to analyze a dataset and display the resulting output.

# Remember to use double backslashes or single forward slashes
setwd("C:\\Users\\Siebelm\\Documents\\4 FMG\\R Presentation")
# An CSV file is separated by commas
# Excel files are separated by semicolons
# Txt files are sometimes separated with a space (" ") or a tab ("/t"")
dataset <- read.csv("winequality-red.csv", sep=",")

colnames(dataset) #check variable names

## [1] "residual.sugar" "density"        "alcohol"        "quality"

head(dataset, 10) #check first 10 rows (six is default)

##    residual.sugar density alcohol quality
## 1             1.9  0.9978     9.4       5
## 2             2.6  0.9968     9.8       5
## 3             2.3  0.9970     9.8       5
## 4             1.9  0.9980     9.8       6
## 5             1.9  0.9978     9.4       5
## 6             1.8  0.9978     9.4       5
## 7             1.6  0.9964     9.4       5
## 8             1.2  0.9946    10.0       7
## 9             2.0  0.9968     9.5       7
## 10            6.1  0.9978    10.5       5

# Rows and Columns
dim(dataset)

## [1] 1599    4

paste("Dataset has", dim(dataset)[1], "rows and", dim(dataset)[2], "columns.")

## [1] "Dataset has 1599 rows and 4 columns."

# The percentage of data missing
(sum(is.na(dataset))/(nrow(dataset)*ncol(dataset)))*100

## [1] 0

paste("The percentage of data missing",round((sum(is.na(dataset))/(nrow(dataset)*ncol(dataset)))*100, digits=2),"%")

## [1] "The percentage of data missing 0 %"

# Number of duplicate rows
nrow(dataset) - nrow(unique(dataset))

## [1] 264

paste("The number of duplicate rows are", nrow(dataset) - nrow(unique(dataset)))

## [1] "The number of duplicate rows are 264"

# R defines three types of variables: Numeric, Factor, and Character
names_numeric <- names(dataset)[which(sapply(dataset, is.numeric))]
paste("Number of Numeric variables are ", length(names_numeric)) # four

## [1] "Number of Numeric variables are  4"

names_fact <- names(dataset)[which(sapply(dataset, is.factor))]
paste("Number of Factor variables are ", length(names_fact)) # zero

## [1] "Number of Factor variables are  0"

names_char <- names(dataset)[which(sapply(dataset, is.character))]
paste("Number of Character variables are ", length(names_char)) # zero

## [1] "Number of Character variables are  0"

# R's dataframe hierarchy is: object "$" component within object "[" components within components "]"
# In other words: dataset "$" variable "[" observations "]"
# Or you can do: dataset "[" row "," column "]"
print("First five obs of quality:")

## [1] "First five obs of quality:"

dataset$quality[1:5]

## [1] 5 5 5 6 5

print("First five obs of residual sugar and alcohol percentage:")

## [1] "First five obs of residual sugar and alcohol percentage:"

dataset[1:5,1]

## [1] 1.9 2.6 2.3 1.9 1.9

dataset[1:5,3]

## [1] 9.4 9.8 9.8 9.8 9.4

OLS Regression

R can simplify the output for regressions.

# Correlation
library(Hmisc)
rcorr(dataset$density, dataset$alcohol)[1]

## $r
##            x          y
## x  1.0000000 -0.4961796
## y -0.4961796  1.0000000

# Remove variable in a new dataset
dataset2 <- dataset #create copy of dataset
dataset2$density <- NULL #remove variable in new dataset (due to possible multicolinearity)

# OLS Regression
model.OLS <- lm(quality~., data = dataset2) # "." means all remaining varible
summary(model.OLS) #display saved results as in table output

## 
## Call:
## lm(formula = quality ~ ., data = dataset2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8412 -0.4080 -0.1698  0.5173  2.5877 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.882059   0.176499  10.663   <2e-16 ***
## residual.sugar -0.003617   0.012618  -0.287    0.774    
## alcohol         0.361043   0.016695  21.626   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7106 on 1596 degrees of freedom
## Multiple R-squared:  0.2268, Adjusted R-squared:  0.2258 
## F-statistic:   234 on 2 and 1596 DF,  p-value: < 2.2e-16

# Different displays of output
cat(paste("", "Output 1", "", sep="\n"))

## 
## Output 1

summary(model.OLS)[4]

## $coefficients
##                    Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)     1.882058583 0.17649950 10.6632517 1.082295e-25
## residual.sugar -0.003616748 0.01261840 -0.2866249 7.744368e-01
## alcohol         0.361043102 0.01669474 21.6261610 3.594183e-91

cat(paste("", "Output 2", "", sep="\n"))

## 
## Output 2

round(summary(model.OLS)$coefficients ,2)

##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)        1.88       0.18   10.66     0.00
## residual.sugar     0.00       0.01   -0.29     0.77
## alcohol            0.36       0.02   21.63     0.00

cat(paste("", "Output 3", "", sep="\n"))

## 
## Output 3

round(coef(model.OLS),2)

##    (Intercept) residual.sugar        alcohol 
##           1.88           0.00           0.36

Logistic Regression

Recoding variables is not necessarily as intuitive as in SAS.

# Recode as Binary
dataset2$quality.dum[dataset2$quality < 6] <- 0 #create variable quality.dum and assign it 0 if quality < 6
dataset2$quality.dum[dataset2$quality >= 6] <- 1 #assign 1 to newly created quality.dum if quality >= 6
dataset2$quality <- NULL #remove former variable

# Logistic Regression Odds Ratios
model.logit <- glm(quality.dum~., family = binomial(link = "logit"), data = dataset2) 
summary(model.logit)

## 
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"), 
##     data = dataset2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0972  -0.9310   0.3845   0.9640   2.0086  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -10.69721    0.68659 -15.580   <2e-16 ***
## residual.sugar  -0.03824    0.04216  -0.907    0.364    
## alcohol          1.05853    0.06679  15.848   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2209.0  on 1598  degrees of freedom
## Residual deviance: 1864.2  on 1596  degrees of freedom
## AIC: 1870.2
## 
## Number of Fisher Scoring iterations: 4

round(exp(coef(model.logit)),2)

##    (Intercept) residual.sugar        alcohol 
##           0.00           0.96           2.88

R can use functions withing the code for predicted probabilities to define changes in the independent variables.

# Range of Alcohol Content
print("Alcohol Content")

## [1] "Alcohol Content"

summary(dataset2$alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.40    9.50   10.20   10.42   11.10   14.90

# High Alcohol
highalcohol.pp <- data.frame(residual.sugar=mean(dataset2$residual.sugar), alcohol=quantile(dataset2$alcohol, c(.75)))
highalcohol.pp.pred <- predict(model.logit, highalcohol.pp, type='response')

# Low Alcohol
lowalcohol.pp <- data.frame(residual.sugar=mean(dataset2$residual.sugar), alcohol=quantile(dataset2$alcohol, c(.25))) 
lowalcohol.pp.pred <- predict(model.logit, lowalcohol.pp, type='response')

# Predicted Probabilities
print("Probilities of Obtaining High Quality Wine")

## [1] "Probilities of Obtaining High Quality Wine"

paste("- Wine with high alcohol content: ", round(highalcohol.pp.pred,2))

## [1] "- Wine with high alcohol content:  0.72"

paste("- Wine with low alcohol content: ", round(lowalcohol.pp.pred,2))

## [1] "- Wine with low alcohol content:  0.32"

Train vs Test datasets

Here, I just want to demonstrate the benefits of being able to easily use multiple datasets at the same time.

### Train v Test ###
dataset2_random <- dataset2[sample(1:nrow(dataset2)), ]
train <- dataset2_random[1:799, ]
test <- dataset2_random[800:1599, ]
train.model <- glm(quality.dum~., family = binomial(link = "logit"), data = train)
summary(train.model)

## 
## Call:
## glm(formula = quality.dum ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5099  -0.9154   0.2686   0.9737   2.0735  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -11.11495    0.98938  -11.23   <2e-16 ***
## residual.sugar  -0.06534    0.07340   -0.89    0.373    
## alcohol          1.09604    0.09658   11.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1107.10  on 798  degrees of freedom
## Residual deviance:  926.47  on 796  degrees of freedom
## AIC: 932.47
## 
## Number of Fisher Scoring iterations: 4

library(ROCR)
pred.model <- predict.glm(train.model, test, type='response')
newpred <- prediction(pred.model, test$quality.dum)
newpred.performance <- performance(newpred, measure = "tpr",x.measure = "fpr")
plot(newpred.performance)
abline(a=0, b= 1)

AUC <- performance(newpred, measure = "auc")
AUC

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.7534198
## 
## 
## Slot "alpha.values":
## list()

print("The train dataset correctly predicted 76% true postives of the test dataset")

## [1] "The train dataset correctly predicted 76% true postives of the test dataset"