Topic 1/2 Variables

Length=c(12, 3, 200, 30)
Width=c(18, 10, 50, NA)
Perimeter = 2*(Length + Width)
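
A side note: arithmetic with NA propagates, so the fourth perimeter comes out as NA rather than a number.

Perimeter        #60 26 500 NA
is.na(Perimeter) #FALSE FALSE FALSE TRUE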

Topic 3 Data

Sorting

Subject = c("A101", "N342", "D527", "C339")
Sex = c("F", "M", "M", "F")
Age = c(13.1, 15.8, 12.7, 13.4)
Time = c(75, 67, 72, 74)
Color = c(NA,NA,NA,NA)
mtime = data.frame(Subject, Sex, Age, Time, Color)

#Sort by Time (the trailing comma keeps all columns; mtime[order(mtime$Time), ] is the safer form)
mtime[order(Time),]
##   Subject Sex  Age Time Color
## 2    N342   M 15.8   67    NA
## 3    D527   M 12.7   72    NA
## 4    C339   F 13.4   74    NA
## 1    A101   F 13.1   75    NA
#Negative sign makes it descending
mtime[order(-Time),]
##   Subject Sex  Age Time Color
## 1    A101   F 13.1   75    NA
## 4    C339   F 13.4   74    NA
## 3    D527   M 12.7   72    NA
## 2    N342   M 15.8   67    NA
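
order() also takes multiple keys; a quick sketch, sorting by Sex and then by descending Time within Sex:

mtime[order(Sex, -Time), ] #secondary sort: Sex first, Time descending within Sex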

Combining Two Data Sets (Frames)

Subject = c("B324", "A712", "H308", "C119")
Sex = c("M", "M", "M", "F")
Age = c(12.5, 13.7, 13.1, 13.3)
Time = c(71, 73, 68, 75)
Color = c("Gray", "Brown", "White", "Gray")
MazeTime = data.frame(Subject, Sex, Age, Time, Color)
#Note: to combine with rbind I had to add a Color column of missing values to mtime so the variable sets match
Combined = rbind(mtime, MazeTime); Combined
##   Subject Sex  Age Time Color
## 1    A101   F 13.1   75  <NA>
## 2    N342   M 15.8   67  <NA>
## 3    D527   M 12.7   72  <NA>
## 4    C339   F 13.4   74  <NA>
## 5    B324   M 12.5   71  Gray
## 6    A712   M 13.7   73 Brown
## 7    H308   M 13.1   68 White
## 8    C119   F 13.3   75  Gray

Merging Data Sets (Frames)

Subject = c("A101","N342", "D527", "C339")
Time2 = c(71, 68, 64, 67)
Retest = data.frame(Subject, Time2)
Merged = merge(mtime, Retest,by="Subject"); Merged
##   Subject Sex  Age Time Color Time2
## 1    A101   F 13.1   75    NA    71
## 2    C339   F 13.4   74    NA    67
## 3    D527   M 12.7   72    NA    64
## 4    N342   M 15.8   67    NA    68
#Note: merge() keeps only rows that match in both data frames (an inner join); unmatched rows are dropped
Merged2 = merge(Combined, Retest,by="Subject"); Merged2
##   Subject Sex  Age Time Color Time2
## 1    A101   F 13.1   75  <NA>    71
## 2    C339   F 13.4   74  <NA>    67
## 3    D527   M 12.7   72  <NA>    64
## 4    N342   M 15.8   67  <NA>    68

Subsetting

MaleMice = subset(Combined, Sex=="M"); MaleMice
##   Subject Sex  Age Time Color
## 2    N342   M 15.8   67  <NA>
## 3    D527   M 12.7   72  <NA>
## 5    B324   M 12.5   71  Gray
## 6    A712   M 13.7   73 Brown
## 7    H308   M 13.1   68 White

Topic 4 Proc Univariate

Mean

machine = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1)
Deviation = c(-.0653, 0.0141, -.0702, -.0734, -.0649, -.0601, -.0631, -.0148, -.0731, -.0764, -.0275, -.0497, -.0741, -.0673, -.0573, -.0629, -.0671, -.0246, -.0222, -.0807, -.0621, -.0785, -.0544, -.0511, -.0138, -.0609, 0.0038, -.0758, -.0731, -.0455)
Parts = data.frame(machine, Deviation)
mean(Deviation)
## [1] -0.05306667
mean(subset(Parts, machine == 1)$Deviation)
## [1] -0.05213333
mean(subset(Parts, machine == 2)$Deviation)
## [1] -0.05446667

Other univariate measures

library(moments)
sd(Deviation)
## [1] 0.02543624
skewness(Deviation)
## [1] 1.192545
kurtosis(Deviation)
## [1] 3.393343

Histogram

hist(Deviation, freq = FALSE) #freq = FALSE puts the y-axis on the density scale so the curve lines up
lines(density(Deviation))

hist(Deviation, breaks = seq(from = -0.09, to = 0.03, by = 0.01))

QQPlot

qqnorm(Deviation)

library(car)
## Loading required package: carData
qqPlot(Deviation)

## [1]  2 27

Boxplots

boxplot(Deviation)

boxplot(Deviation~machine)

Topic 5 Frequency Tables and Graphs

Frequency Tables

Data=read.table("TeacherSalary.txt")
names(Data) <- c("ID", "Gender", "Degree", "Years", "Salary")
attach(Data)
DegreeSummary <- table(Degree); DegreeSummary
## Degree
##  BS  MS PHD 
##  10   6   3
#Or use factor
factor(Degree)
##  [1] BS  BS  BS  MS  BS  BS  MS  MS  BS  BS  PHD MS  PHD MS  BS  BS  MS 
## [18] BS  PHD
## Levels: BS MS PHD
table(subset(Data, Gender == "M")$Degree)
## 
##  BS  MS PHD 
##   6   4   2
table(subset(Data, Gender == "F")$Degree)
## 
##  BS  MS PHD 
##   4   2   1

2-Way Frequency Table

mytable = table(Gender,Degree) # first argument (Gender) gives the rows, second (Degree) the columns
mytable
##       Degree
## Gender BS MS PHD
##      F  4  2   1
##      M  6  4   2
margin.table(mytable, 1) # Gender frequencies (summed over Degree)
## Gender
##  F  M 
##  7 12
margin.table(mytable, 2) # Degree frequencies (summed over Gender)
## Degree
##  BS  MS PHD 
##  10   6   3
prop.table(mytable) # cell proportions (of the grand total)
##       Degree
## Gender         BS         MS        PHD
##      F 0.21052632 0.10526316 0.05263158
##      M 0.31578947 0.21052632 0.10526316
prop.table(mytable, 1) # row proportions
##       Degree
## Gender        BS        MS       PHD
##      F 0.5714286 0.2857143 0.1428571
##      M 0.5000000 0.3333333 0.1666667
prop.table(mytable, 2) # column proportions
##       Degree
## Gender        BS        MS       PHD
##      F 0.4000000 0.3333333 0.3333333
##      M 0.6000000 0.6666667 0.6666667

Weighted Tables

#The original attempt quoted the variable names in data.frame(), which builds a
#one-row frame of strings; that is why wtd.table() returned a table of extent 0.
gender = c("F", "F", "F", "M", "M", "M")
degree = c("BS", "MS", "PHD", "BS", "MS", "PHD")
freq = c(4, 2, 1, 6, 4, 2)
degreebygender = data.frame(gender, degree, freq) #pass the vectors, not their names

library(questionr)
wtd.table(x = degreebygender$degree, weights = degreebygender$freq)
#with the corrected frame this gives the weighted counts BS 10, MS 6, PHD 3
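
A base R alternative that needs no extra package: xtabs() treats a left-hand-side frequency column as counts.

xtabs(freq ~ gender + degree, data = degreebygender) #weighted two-way table in one call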

Bar Graphs

barplot(table(Degree)) #note: barplot() needs the counts from table(); passing factor(Degree) alone doesn't work

barplot(table(Degree), horiz=TRUE)

barplot(table(Years)) 

barplot(table(Gender,Salary))

barplot(table(Salary,Gender), beside = TRUE)

Pie Charts

pie(table(Years)) #pie() wants counts, so tabulate Years first

Scatterplots

plot(Salary, Years)

library(car) 
scatterplot(Salary ~ Years)
## Warning in Ops.factor(x[floor(d)], x[ceiling(d)]): '+' not meaningful for
## factors
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
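
Those warnings mean Salary came out of read.table() as a factor. A hedged fix, assuming the levels are plain numbers (if they contain commas or dollar signs, strip those with gsub() first); SalaryNum is a made-up name:

SalaryNum = as.numeric(as.character(Salary)) #as.numeric() alone would return the level codes
scatterplot(SalaryNum ~ Years)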

Topic 6 Hypothesis Testing

1 sample Hypothesis Test

Hours=c(13, 8, 18, 11, 11, 10, 15, 9, 17, 14)
summary(Hours)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   10.25   12.00   12.60   14.75   18.00
mean(Hours)
## [1] 12.6
sd(Hours)
## [1] 3.373096
t.test(Hours,mu=11.8, conf.level = .98) # Ho: mu=11.8
## 
##  One Sample t-test
## 
## data:  Hours
## t = 0.75, df = 9, p-value = 0.4724
## alternative hypothesis: true mean is not equal to 11.8
## 98 percent confidence interval:
##   9.590466 15.609534
## sample estimates:
## mean of x 
##      12.6
help(t.test)

#One Sided
t.test(Hours,mu=11.8, alternative = "greater") 
## 
##  One Sample t-test
## 
## data:  Hours
## t = 0.75, df = 9, p-value = 0.2362
## alternative hypothesis: true mean is greater than 11.8
## 95 percent confidence interval:
##  10.64468      Inf
## sample estimates:
## mean of x 
##      12.6
t.test(Hours,mu=11.8, alternative = "less") 
## 
##  One Sample t-test
## 
## data:  Hours
## t = 0.75, df = 9, p-value = 0.7638
## alternative hypothesis: true mean is less than 11.8
## 95 percent confidence interval:
##      -Inf 14.55532
## sample estimates:
## mean of x 
##      12.6

2-sample

T25 = c(49, 34, 24, 32, 52, 14, 28, 18, 28, 47, 60)
T35 = c(28, 55, 45, 51, 41, 27, 44, 48, 54, 67, 46, 59)
Boxplot(T25) #car::Boxplot (capital B) labels outliers; base boxplot() also works

Boxplot(T35)

var.test(T25,T35) #Since the variances are not significantly different, we can use the pooled t-test.
## 
##  F test to compare two variances
## 
## data:  T25 and T35
## F = 1.6393, num df = 10, denom df = 11, p-value = 0.4294
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4649565 6.0078345
## sample estimates:
## ratio of variances 
##           1.639284
t.test(T25,T35, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  T25 and T35
## t = -2.165, df = 21, p-value = 0.04205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.5116483  -0.4732002
## sample estimates:
## mean of x mean of y 
##  35.09091  47.08333

Unpooled (Default)

t.test(T25,T35) 
## 
##  Welch Two Sample t-test
## 
## data:  T25 and T35
## t = -2.1413, df = 18.93, p-value = 0.04548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.7175653  -0.2672832
## sample estimates:
## mean of x mean of y 
##  35.09091  47.08333

Paired T-Test

Baseline=c(6.37, 5.69, 5.58, 5.27, 5.11, 4.89, 4.7, 3.53)
Caffeine = c( 4.52, 5.44, 4.7, 3.81, 4.06, 3.22, 2.96, 3.2)
t.test(Baseline, Caffeine, paired = TRUE)
## 
##  Paired t-test
## 
## data:  Baseline and Caffeine
## t = 5.1878, df = 7, p-value = 0.00127
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.6278643 1.6796357
## sample estimates:
## mean of the differences 
##                 1.15375
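
Equivalently, a paired t-test is a one-sample t-test on the differences, which reproduces the same t, df, and p-value:

t.test(Baseline - Caffeine, mu = 0) #identical to the paired test above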

Nonparametric Paired Tests

library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following objects are masked from 'Data':
## 
##     Degree, Salary
## The following objects are masked from 'package:carData':
## 
##     Vocab, Wool
## The following object is masked from 'package:datasets':
## 
##     Orange
A=c( 276.3, 288.4, 280.2, 294.7, 268.7, 282.8, 276.1, 179)
B=c( 275.1, 286.8, 277.3, 290.6, 269.1, 281, 275.3, 179.1)
SIGN.test(A,B)
## 
##  Dependent-samples Sign-Test
## 
## data:  A and B
## S = 6, p-value = 0.2891
## alternative hypothesis: true median difference is not equal to 0
## 95 percent confidence interval:
##  -0.1975  3.2900
## sample estimates:
## median of x-y 
##           1.4 
## 
## Achieved and Interpolated Confidence Intervals: 
## 
##                   Conf.Level  L.E.pt U.E.pt
## Lower Achieved CI     0.9297 -0.1000   2.90
## Interpolated CI       0.9500 -0.1975   3.29
## Upper Achieved CI     0.9922 -0.4000   4.10
wilcox.test(A, B, paired = TRUE) #Wilcoxon Signed Rank Test
## 
##  Wilcoxon signed rank test
## 
## data:  A and B
## V = 33, p-value = 0.03906
## alternative hypothesis: true location shift is not equal to 0

2 Independent Samples

wilcox.test(A, B, paired = FALSE) #Wilcoxon Rank Sum Test (A and B are really paired; this just shows the syntax)
## 
##  Wilcoxon rank sum test
## 
## data:  A and B
## W = 35, p-value = 0.7984
## alternative hypothesis: true location shift is not equal to 0

Topic 7 Linear Regression

Pearson Correlation Coefficient r

year = seq.int(from = 1996, to = 2007)
csco = c(0.704, 0.314, 1.50, 1.31, -0.286, -0.527, -0.277, 0.85, -0.203, 0.029, 0.434, 0.044)
wd = c(0.204, 0.448, -0.080, -0.015, -0.004, -0.277, -0.203, 0.444, 0.202, -0.129, 0.443, -0.043)
ge = c(0.565, 0.587, 0.451, 0.574, -0.055, -0.151, -0.377, 0.308, 0.207, -0.014, 0.093, 0.126)
xom = c(0.405, 0.342, 0.254, 0.151, 0.127, -0.066, -0.089, 0.206, 0.281, 0.118, 0.391, 0.243)
teco = c(-0.012, 0.223, 0.050, -0.303, 0.849, -0.150, -0.369, 0.004, 0.128, 0.170, 0.051, 0.058)
stockreturn = data.frame(year, csco, wd, ge, xom, teco)

help(cor)
cor(stockreturn)
##             year       csco          wd          ge        xom        teco
## year  1.00000000 -0.3935922 -0.02930211 -0.54302353 -0.1131893 -0.02207881
## csco -0.39359224  1.0000000  0.29563711  0.76624782  0.4635722 -0.24593499
## wd   -0.02930211  0.2956371  1.00000000  0.53453465  0.7716692  0.20512889
## ge   -0.54302353  0.7662478  0.53453465  1.00000000  0.7263548 -0.00905595
## xom  -0.11318934  0.4635722  0.77166915  0.72635475  1.0000000  0.26481697
## teco -0.02207881 -0.2459350  0.20512889 -0.00905595  0.2648170  1.00000000
plot(stockreturn)

cor.test(csco, wd) #cor.test() only handles two variables at a time (see below for all pairs at once)
## 
##  Pearson's product-moment correlation
## 
## data:  csco and wd
## t = 0.97863, df = 10, p-value = 0.3508
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3351238  0.7434073
## sample estimates:
##       cor 
## 0.2956371
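
To test more than two at a time, one option is corr.test() from the psych package (loaded later in these notes for the kappa test); it returns the full matrices of correlations, sample sizes, and p-values in one call.

library(psych)
corr.test(stockreturn) #r, n, and p matrices for every pair at once
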
library("ggpubr")
## Loading required package: ggplot2
## Loading required package: magrittr
ggscatter(stockreturn, x = "csco", y = "wd", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Cisco Stock", ylab = "Walt Disney Stock")

#Interesting but not great 
library(corrgram)
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
corrgram(stockreturn)

Linear Regression

dist = c(3.4, 1.8, 4.6, 2.3, 3.1, 5.5, 0.7, 3.0, 2.6, 4.3, 2.1, 1.1, 6.4, 4.8, 3.8, 4.2)
damage = c(26.2, 17.8, 31.3, 23.1, 27.5, 36.0, 14.1, 22.3, 19.6, 31.3, 24.0, 17.3, 43.2, 36.4, 26.1, 28.8)
firedamage = data.frame(dist, damage)
lm = lm(damage~dist) #careful: this object name masks the lm() function; a name like fit is safer
summary(lm)
## 
## Call:
## lm(formula = damage ~ dist)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3521 -1.4473 -0.1764  1.7117  3.4349 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.5397     1.3135   8.024 1.32e-06 ***
## dist          4.7740     0.3562  13.402 2.24e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.176 on 14 degrees of freedom
## Multiple R-squared:  0.9277, Adjusted R-squared:  0.9225 
## F-statistic: 179.6 on 1 and 14 DF,  p-value: 2.237e-09
plot(lm) #diagnostics

leveragePlots(lm)

ggscatter(firedamage, x = "dist", y = "damage", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Distance From Firehouse (miles)", ylab = "Damage ($1000)")

# Confidence interval for the mean response
Newdist = data.frame(dist = 5)
predict(lm, Newdist, interval = "confidence")
##        fit      lwr      upr
## 1 34.40981 32.69554 36.12408
# Prediction interval for a new observation (wider than the confidence interval)
predict(lm, Newdist, interval = "predict")
##        fit      lwr      upr
## 1 34.40981 29.43758 39.38204

Topic 7B Non-linear

Bad Linear Fit

Yr = c(1913, 1923, 1929, 1933, 1943, 1953,1963, 1973, 1983, 1993, 1999, 2003, 2007, 2008, 2013)
Dow = c(1886, 1252, 4556, 1694, 1770, 2367, 5761, 4161, 2715, 5727, 14690, 12018, 14315, 9421, 16123)
djia = data.frame(Yr, Dow)
fit = lm(Dow~Yr)
plot(fit) #diagnostics

ggscatter(djia, x = "Yr", y = "Dow", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Year", ylab = "Dow")

Quadratic Model

Yr2= Yr^2
quad = lm(Dow~Yr+Yr2)
summary(quad)
## 
## Call:
## lm(formula = Dow ~ Yr + Yr2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3742.8 -1086.8   -51.5  1519.8  4351.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  8.491e+06  3.094e+06   2.745   0.0178 *
## Yr          -8.764e+03  3.149e+03  -2.783   0.0166 *
## Yr2          2.262e+00  8.012e-01   2.823   0.0154 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2517 on 12 degrees of freedom
## Multiple R-squared:  0.8079, Adjusted R-squared:  0.7759 
## F-statistic: 25.24 on 2 and 12 DF,  p-value: 5.021e-05
plot(Dow~Yr)
lines(smooth.spline(Yr, predict(quad)))
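
The same quadratic can be fit without building Yr2 by hand; with raw = TRUE, poly() reproduces the Yr + Yr2 coefficients exactly.

quad2 = lm(Dow ~ poly(Yr, 2, raw = TRUE)) #same fitted values as quad above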

Cubic Model

Yr3= Yr^3
cubic = lm(Dow~Yr+Yr2+Yr3)
summary(cubic)
## 
## Call:
## lm(formula = Dow ~ Yr + Yr2 + Yr3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3931.3  -970.7    63.6  1091.9  4914.4 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.329e+08  2.261e+08  -1.030    0.325
## Yr           3.602e+05  3.455e+05   1.043    0.319
## Yr2         -1.857e+02  1.760e+02  -1.055    0.314
## Yr3          3.191e-02  2.987e-02   1.068    0.308
## 
## Residual standard error: 2502 on 11 degrees of freedom
## Multiple R-squared:  0.826,  Adjusted R-squared:  0.7785 
## F-statistic:  17.4 on 3 and 11 DF,  p-value: 0.0001735
plot(Dow~Yr)
lines(smooth.spline(Yr, predict(cubic)))

Exponential Transformation

lny=log(Dow)
logfit= lm(lny~Yr)
plot(lny~Yr)
lines(smooth.spline(Yr, predict(logfit)))

Untransform

yhat=exp(predict(logfit))
plot(Dow~Yr)
lines(smooth.spline(Yr, yhat))
points(Yr, yhat,col=3)

#help(points)

Macros

I could not figure this out. I’m a failure :( (The closest R analogue to a SAS macro turns out to be a plain function; see the sketch below.)
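
A minimal sketch, assuming a "macro" means a reusable block of analysis code: in R that is just a function (summarize_var is a made-up name).

#Wrap the repeated steps in a function, then call it once per variable
summarize_var = function(x, label = "variable") {
  cat(label, ": mean =", round(mean(x, na.rm = TRUE), 4),
      ", sd =", round(sd(x, na.rm = TRUE), 4), "\n")
}
summarize_var(Hours, "Hours")
summarize_var(Deviation, "Deviation")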

Box Cox

boxCox(lm) #car::boxCox profiles Box-Cox power transformations of the response (lm is the fire-damage fit)

Topic 8

Non-linear Regression

You’re welcome, because figuring this out was a pain in the ASS!

library(nlmrt)
x = c(4, 6, 8, 10, 12, 14, 16, 18, 20)
y = c(0.2, 0.7, 1.4, 2.8, 5.8, 11.1, 21.7, 43.2, 80.1)
data = data.frame(x, y)
model = wrapnls(formula = y ~ a*exp(b*x), data = data, start = c(a = 0.04, b = 0.4))
#help(wrapnls)
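
For comparison, a sketch with base R's nls(): same model and starting values, though nls() can be fussier about poor starts than wrapnls().

model2 = nls(y ~ a*exp(b*x), data = data, start = c(a = 0.04, b = 0.4))
summary(model2)
plot(y ~ x); lines(x, fitted(model2)) #overlay the fitted exponential curve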

Topic 9 Categorical Data Formatting

Reading a string of numbers into variables (survey 1: 181143)

#Reading this in and separating it was POOP! UGH
d=read.table("DecodingData.txt")
df = cbind(read.fwf(file = textConnection(as.character(d[, 1])), widths = c(2, 1, 1, 1, 1), colClasses = "character", col.names = c("a", "s", "ys", "cf", "p")))

#This separates the data into a matrix, not ideal
df2 <- cbind(a=substr(d[, 1],1,2), s=substr(d[, 1],3,3), ys=substr(d[, 1],4,4), cf=substr(d[, 1],5,5), p=substr(d[, 1],6,6))
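
If the matrix form is the only complaint, as.data.frame() converts it (cbind() of character vectors is what produced a matrix in the first place):

df2 = as.data.frame(df2, stringsAsFactors = FALSE) #now a data frame like df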

Renaming Variables

# df = dataframe
# old.var.name = The name you don't like anymore
# new.var.name = The name you want to get
names(df)[names(df) == 'old.var.name'] <- 'new.var.name'
names(df)[names(df) == 'a'] <- 'Age'
names(df)[names(df) == 's'] <- 'Sex'
names(df)[names(df) == 'ys'] <- 'Years in College' #a name with spaces needs backticks everywhere later
df
##    Age Sex Years in College cf    p
## 1   18   1                1  4    3
## 2   22   1                4  2    3
## 3   19   2                1  2    4
## 4   20   2                2  1    2
## 5   24   1                5  1    4
## 6   23   2                3  3    5
## 7   17   2                1  2    .
## 8   26   2                3  2    4
## 9   19   .                2  3    4
## 10  20   2                2  4    2
## 11  21   2                3  2    2
## 12  28   1                5  .    .
## 13  18   2                1  3    1
## 14  20   2                2  1    4
## 15  22   1                4  .    3
## 16  23   2                4  2    3
## 17  20   1                1  3    5
## 18  20   1                4  2    4
## 19  27   2                5  3    4
## 20  21   2                4  1    2
## 21  18   2                1  2    3
## 22  18   2                1  3    3
## 23  22   2                4  2    4
## 24  20   1                2  4    5
## 25  25   2                1  3    3
## 26  19   1                1  4    3
## 27  21   3                1  . <NA>
## 28  19   2                2  2    3
## 29  22   2                5  2    2
## 30  21   1                4  4    4
## 31  19   2                2  3    1

Labels

df$Sex <- ordered(df$Sex, levels = c("1","2"), labels = c("Male", "Female"))
df$`Years in College` <- ordered(df$`Years in College`, levels = c("1","2","3","4","5"), labels = c("Freshman", "Sophomore", "Junior", "Senior", "Grad Student"))
df$cf <- ordered(df$cf, levels = c("1","2","3","4","5"), labels = c("Poor","Fair","Adequate","Good","Excellent"))
df$p <- ordered(df$p, levels = c("1","2","3","4","5"), labels = c("Poor","Fair","Adequate","Good","Excellent"))
df
##    Age    Sex Years in College       cf         p
## 1   18   Male         Freshman     Good  Adequate
## 2   22   Male           Senior     Fair  Adequate
## 3   19 Female         Freshman     Fair      Good
## 4   20 Female        Sophomore     Poor      Fair
## 5   24   Male     Grad Student     Poor      Good
## 6   23 Female           Junior Adequate Excellent
## 7   17 Female         Freshman     Fair      <NA>
## 8   26 Female           Junior     Fair      Good
## 9   19   <NA>        Sophomore Adequate      Good
## 10  20 Female        Sophomore     Good      Fair
## 11  21 Female           Junior     Fair      Fair
## 12  28   Male     Grad Student     <NA>      <NA>
## 13  18 Female         Freshman Adequate      Poor
## 14  20 Female        Sophomore     Poor      Good
## 15  22   Male           Senior     <NA>  Adequate
## 16  23 Female           Senior     Fair  Adequate
## 17  20   Male         Freshman Adequate Excellent
## 18  20   Male           Senior     Fair      Good
## 19  27 Female     Grad Student Adequate      Good
## 20  21 Female           Senior     Poor      Fair
## 21  18 Female         Freshman     Fair  Adequate
## 22  18 Female         Freshman Adequate  Adequate
## 23  22 Female           Senior     Fair      Good
## 24  20   Male        Sophomore     Good Excellent
## 25  25 Female         Freshman Adequate  Adequate
## 26  19   Male         Freshman     Good  Adequate
## 27  21   <NA>         Freshman     <NA>      <NA>
## 28  19 Female        Sophomore     Fair  Adequate
## 29  22 Female     Grad Student     Fair      Fair
## 30  21   Male           Senior     Good      Good
## 31  19 Female        Sophomore Adequate      Poor
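
Note that the "." missing-value codes become <NA> automatically, because "." is not among the listed levels. Also, strictly speaking Sex is nominal rather than ordinal, so plain factor() would be more appropriate there than ordered(); ordered() does fit the rating scales cf and p.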

Creating a new variable using ifelse (another proud moment of my perseverance)

#Pro Tip: & is logical AND, while | is logical OR (the vertical bar, below the delete key)
Status = ifelse((df$`Years in College` == "Freshman") | (df$`Years in College` == "Sophomore"), "Underclassman", ifelse((df$`Years in College` == "Junior") | (df$`Years in College` == "Senior"), "Upperclassman", "Graduate"))

dfnew=data.frame(df,Status)
dfnew
##    Age    Sex Years.in.College       cf         p        Status
## 1   18   Male         Freshman     Good  Adequate Underclassman
## 2   22   Male           Senior     Fair  Adequate Upperclassman
## 3   19 Female         Freshman     Fair      Good Underclassman
## 4   20 Female        Sophomore     Poor      Fair Underclassman
## 5   24   Male     Grad Student     Poor      Good      Graduate
## 6   23 Female           Junior Adequate Excellent Upperclassman
## 7   17 Female         Freshman     Fair      <NA> Underclassman
## 8   26 Female           Junior     Fair      Good Upperclassman
## 9   19   <NA>        Sophomore Adequate      Good Underclassman
## 10  20 Female        Sophomore     Good      Fair Underclassman
## 11  21 Female           Junior     Fair      Fair Upperclassman
## 12  28   Male     Grad Student     <NA>      <NA>      Graduate
## 13  18 Female         Freshman Adequate      Poor Underclassman
## 14  20 Female        Sophomore     Poor      Good Underclassman
## 15  22   Male           Senior     <NA>  Adequate Upperclassman
## 16  23 Female           Senior     Fair  Adequate Upperclassman
## 17  20   Male         Freshman Adequate Excellent Underclassman
## 18  20   Male           Senior     Fair      Good Upperclassman
## 19  27 Female     Grad Student Adequate      Good      Graduate
## 20  21 Female           Senior     Poor      Fair Upperclassman
## 21  18 Female         Freshman     Fair  Adequate Underclassman
## 22  18 Female         Freshman Adequate  Adequate Underclassman
## 23  22 Female           Senior     Fair      Good Upperclassman
## 24  20   Male        Sophomore     Good Excellent Underclassman
## 25  25 Female         Freshman Adequate  Adequate Underclassman
## 26  19   Male         Freshman     Good  Adequate Underclassman
## 27  21   <NA>         Freshman     <NA>      <NA> Underclassman
## 28  19 Female        Sophomore     Fair  Adequate Underclassman
## 29  22 Female     Grad Student     Fair      Fair      Graduate
## 30  21   Male           Senior     Good      Good Upperclassman
## 31  19 Female        Sophomore Adequate      Poor Underclassman

Breaking a numeric variable into categories (a cute little technique, “ifelse” should work too)

agegroup = cut(as.numeric(df$Age), breaks = c(0, 16, 19, 21, 23, 27, max(df$Age)), labels = c("Out of Range", "17-19", "20-21", "22-23", "24-27", "28 and up"), right = TRUE)

dfnew2 = data.frame(dfnew, agegroup); dfnew2
##    Age    Sex Years.in.College       cf         p        Status  agegroup
## 1   18   Male         Freshman     Good  Adequate Underclassman     17-19
## 2   22   Male           Senior     Fair  Adequate Upperclassman     22-23
## 3   19 Female         Freshman     Fair      Good Underclassman     17-19
## 4   20 Female        Sophomore     Poor      Fair Underclassman     20-21
## 5   24   Male     Grad Student     Poor      Good      Graduate     24-27
## 6   23 Female           Junior Adequate Excellent Upperclassman     22-23
## 7   17 Female         Freshman     Fair      <NA> Underclassman     17-19
## 8   26 Female           Junior     Fair      Good Upperclassman     24-27
## 9   19   <NA>        Sophomore Adequate      Good Underclassman     17-19
## 10  20 Female        Sophomore     Good      Fair Underclassman     20-21
## 11  21 Female           Junior     Fair      Fair Upperclassman     20-21
## 12  28   Male     Grad Student     <NA>      <NA>      Graduate 28 and up
## 13  18 Female         Freshman Adequate      Poor Underclassman     17-19
## 14  20 Female        Sophomore     Poor      Good Underclassman     20-21
## 15  22   Male           Senior     <NA>  Adequate Upperclassman     22-23
## 16  23 Female           Senior     Fair  Adequate Upperclassman     22-23
## 17  20   Male         Freshman Adequate Excellent Underclassman     20-21
## 18  20   Male           Senior     Fair      Good Upperclassman     20-21
## 19  27 Female     Grad Student Adequate      Good      Graduate     24-27
## 20  21 Female           Senior     Poor      Fair Upperclassman     20-21
## 21  18 Female         Freshman     Fair  Adequate Underclassman     17-19
## 22  18 Female         Freshman Adequate  Adequate Underclassman     17-19
## 23  22 Female           Senior     Fair      Good Upperclassman     22-23
## 24  20   Male        Sophomore     Good Excellent Underclassman     20-21
## 25  25 Female         Freshman Adequate  Adequate Underclassman     24-27
## 26  19   Male         Freshman     Good  Adequate Underclassman     17-19
## 27  21   <NA>         Freshman     <NA>      <NA> Underclassman     20-21
## 28  19 Female        Sophomore     Fair  Adequate Underclassman     17-19
## 29  22 Female     Grad Student     Fair      Fair      Graduate     22-23
## 30  21   Male           Senior     Good      Good Upperclassman     20-21
## 31  19 Female        Sophomore Adequate      Poor Underclassman     17-19

Topic 10 Contingency Tables

Goodness of Fit

plants = c(54, 122, 58)
gof=chisq.test(plants, p=c(1/4, 1/2, 1/4));gof #1:2:1 Expected ratio 
## 
##  Chi-squared test for given probabilities
## 
## data:  plants
## X-squared = 0.5641, df = 2, p-value = 0.7542
gof=chisq.test(plants, p=c(.25, .50, .25)) #probabilities need to sum to 1
gof$expected
## [1]  58.5 117.0  58.5
#note: a table() built from the raw data (as in Topic 5) would work here instead of a matrix

Chi Squared Test of Association (and Homogeneity)

#Input data as a matrix (careful: the name matrix masks the matrix() function)
matrix=matrix(c(24, 289, 9, 100, 13, 565), nrow = 2)
colnames(matrix)=c("No Vaccine", "One Shot", "Two Shot")
rownames(matrix)=c("Flu", "No Flu")
matrix
##        No Vaccine One Shot Two Shot
## Flu            24        9       13
## No Flu        289      100      565
cstest=chisq.test(matrix);cstest
## 
##  Pearson's Chi-squared test
## 
## data:  matrix
## X-squared = 17.313, df = 2, p-value = 0.000174
cstest$expected
##        No Vaccine One Shot Two Shot
## Flu        14.398    5.014   26.588
## No Flu    298.602  103.986  551.412
cstest$residuals
##        No Vaccine   One Shot  Two Shot
## Flu     2.5305249  1.7801030 -2.635195
## No Flu -0.5556679 -0.3908858  0.578652
#If expected values are not >= 5, use Fisher's Exact Test:
fisher.test(matrix)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix
## p-value = 0.0001072
## alternative hypothesis: two.sided

Relative Risk

M=matrix(c(237, 197, 3489, 5870), nrow=2)
colnames(M)=c("Low", "Normal")
rownames(M)=c("Smoker", "Non Smoker")
M
##            Low Normal
## Smoker     237   3489
## Non Smoker 197   5870
barplot(M)

barplot(M, beside = TRUE)

library(epiR)
## Loading required package: survival
## Package epiR 0.9-99 is loaded
## Type help(epi.about) for summary information
## 
epi.2by2(M)
##              Outcome +    Outcome -      Total        Inc risk *
## Exposed +          237         3489       3726              6.36
## Exposed -          197         5870       6067              3.25
## Total              434         9359       9793              4.43
##                  Odds
## Exposed +      0.0679
## Exposed -      0.0336
## Total          0.0464
## 
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio                               1.96 (1.63, 2.36)
## Odds ratio                                   2.02 (1.67, 2.46)
## Attrib risk *                                3.11 (2.21, 4.02)
## Attrib risk in population *                  1.18 (0.58, 1.79)
## Attrib fraction in exposed (%)               48.95 (38.61, 57.55)
## Attrib fraction in population (%)            26.73 (18.95, 33.77)
## -------------------------------------------------------------------
##  X2 test statistic: 52.838 p-value: < 0.001
##  Wald confidence limits
##  * Outcomes per 100 population units
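
Sanity-checking the incidence risk ratio by hand confirms the epi.2by2 output:

risk_smoker = 237/(237 + 3489)    #about 0.0636
risk_nonsmoker = 197/(197 + 5870) #about 0.0325
risk_smoker/risk_nonsmoker        #about 1.96, matching "Inc risk ratio" above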

McNemar’s Test (when the samples are not independent)

MCMData=matrix(c(26, 7, 15, 37), nrow=2)
colnames(MCMData)=c("Sibling (Control) yes", "no" )
rownames(MCMData)=c("Hodgkin patient yes", "no")
MCMData
##                     Sibling (Control) yes no
## Hodgkin patient yes                    26 15
## no                                      7 37
mcnemar.test(MCMData)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  MCMData
## McNemar's chi-squared = 2.2273, df = 1, p-value = 0.1356

Kappa Test of Strength of Agreement

Kappa=matrix(c(9, 3, 6, 12), nrow=2)
colnames(Kappa)=c("yes", "no" )
rownames(Kappa)=c("yes", "no")
Kappa
##     yes no
## yes   9  6
## no    3 12
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:questionr':
## 
##     describe
## The following object is masked from 'package:car':
## 
##     logit
cohen.kappa(Kappa)
## Call: cohen.kappa1(x = x, w = w, n.obs = n.obs, alpha = alpha, levels = levels)
## 
## Cohen Kappa and Weighted Kappa correlation coefficients and confidence boundaries 
##                  lower estimate upper
## unweighted kappa 0.079      0.4  0.72
## weighted kappa   0.079      0.4  0.72
## 
##  Number of subjects = 30

Topic 11 Formatting Data

Dates

date()
## [1] "Sat Feb  9 14:20:39 2019"
t = Sys.Date();t #Today (careful: this name masks the transpose function t())
## [1] "2019-02-09"
format(t, format="%B %d")
## [1] "February 09"
# convert date info in format 'mm/dd/yyyy'
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y");dates
## [1] "1965-01-05" "1975-08-16"

Do/for Loops

invest=20000
rate = 0.07

for (i in 1:5) {
  invest=invest*(1+rate)
  print(invest)
}
## [1] 21400
## [1] 22898
## [1] 24500.86
## [1] 26215.92
## [1] 28051.03
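
The same five balances drop out of a vectorized one-liner, no loop required:

20000*(1 + 0.07)^(1:5) #21400.00 22898.00 24500.86 26215.92 28051.03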

Topic 12

ANOVA (1-way and Kruskal-Wallis)

scores = c( 101,118,98,107,116, 151,149,121,112,127,138, 119,109,198,186,160,134)
group = c("A","A","A","A","A","B","B","B","B","B","B","C","C","C","C","C","C")
videogame=data.frame(scores,group);videogame
##    scores group
## 1     101     A
## 2     118     A
## 3      98     A
## 4     107     A
## 5     116     A
## 6     151     B
## 7     149     B
## 8     121     B
## 9     112     B
## 10    127     B
## 11    138     B
## 12    119     C
## 13    109     C
## 14    198     C
## 15    186     C
## 16    160     C
## 17    134     C
library("ggpubr")
ggboxplot(videogame, x = "group", y = "scores")

ggboxplot(videogame, x = "group", y = "scores", color = "group", order = c("A", "B", "C"), ylab = "Scores", xlab = "group")

ggline(videogame, x = "group", y = "scores", add = c("mean_se", "jitter"))

anova <- aov(scores ~ group, data = videogame) #careful: this name masks the anova() function
# Summary of the analysis
summary(anova)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## group        2   5052  2526.0   4.349  0.034 *
## Residuals   14   8132   580.9                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey Post hoc Test
TukeyHSD(anova)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = scores ~ group, data = videogame)
## 
## $group
##     diff        lwr      upr     p adj
## B-A   25 -13.196243 63.19624 0.2349865
## C-A   43   4.803757 81.19624 0.0269286
## C-B   18 -18.418689 54.41869 0.4215527
#If the ANOVA assumptions are not met, use the non-parametric Kruskal-Wallis test:
kruskal.test(scores ~ group, data = videogame)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  scores by group
## Kruskal-Wallis chi-squared = 7.7203, df = 2, p-value = 0.02107

N-Way / Factorial ANOVA

#So I learned that I don't care for typing data entry out by hand..... there is an easier way with rep(); see the sketch after the data frame printout
alz = c(21, 24, 22, 18, 20, 9, 7, 9, 8, 12, 12, 15, 18, 20, 19, 15, 8, 12, 13, 11)
stage = c("Early","Early","Early","Early","Early","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle")
music=c("Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart")
alzdf=data.frame(alz,stage,music); alzdf
##    alz  stage  music
## 1   21  Early  Piano
## 2   24  Early  Piano
## 3   22  Early  Piano
## 4   18  Early  Piano
## 5   20  Early  Piano
## 6    9  Early Mozart
## 7    7  Early Mozart
## 8    9  Early Mozart
## 9    8  Early Mozart
## 10  12  Early Mozart
## 11  12 Middle  Piano
## 12  15 Middle  Piano
## 13  18 Middle  Piano
## 14  20 Middle  Piano
## 15  19 Middle  Piano
## 16  15 Middle Mozart
## 17   8 Middle Mozart
## 18  12 Middle Mozart
## 19  13 Middle Mozart
## 20  11 Middle Mozart
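
The easier way exists: rep() builds these patterned vectors in one line each (stage2 and music2 are just names for the check).

stage2 = rep(c("Early", "Middle"), each = 10)
music2 = rep(rep(c("Piano", "Mozart"), each = 5), times = 2)
identical(stage, stage2) #TRUE
identical(music, music2) #TRUE
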
library("ggpubr")
ggboxplot(alzdf, x = "stage", y = "alz", color = "music")

ggboxplot(alzdf, x = "music", y = "alz", color = "stage")

library("ggpubr")
ggline(alzdf, x = "stage", y = "alz", color = "music",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

ggline(alzdf, x = "music", y = "alz", color = "stage",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

#Without Interaction
anova2 <- aov(alz ~ stage + music, data = alzdf)
summary(anova2)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## stage        1    2.5     2.5   0.253    0.622    
## music        1  361.3   361.3  37.254 1.17e-05 ***
## Residuals   17  164.9     9.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#With Interaction

anova3 <- aov(alz ~ stage + music +stage:music, data = alzdf)
summary(anova3)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## stage        1    2.5     2.5   0.378  0.54712    
## music        1  361.3   361.3  55.792 1.34e-06 ***
## stage:music  1   61.3    61.3   9.459  0.00724 ** 
## Residuals   16  103.6     6.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey 
TukeyHSD(anova3, which = "stage")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
## 
## $stage
##              diff       lwr      upr     p adj
## Middle-Early -0.7 -3.112411 1.712411 0.5471223
TukeyHSD(anova3, which = "music")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
## 
## $music
##              diff      lwr      upr   p adj
## Piano-Mozart  8.5 6.087589 10.91241 1.3e-06

Same analysis with a 3rd level of music, “Easy Listening”

alz = c(21, 24, 22, 18, 20, 9, 7, 9, 8, 12, 12, 15, 18, 20, 19, 15, 8, 12, 13, 11,29,26,30,24,26,17,19,21,18,16)
stage = c("Early","Early","Early","Early","Early","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle")
music=c("Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening")
alzdf=data.frame(alz,stage,music); alzdf
##    alz  stage          music
## 1   21  Early          Piano
## 2   24  Early          Piano
## 3   22  Early          Piano
## 4   18  Early          Piano
## 5   20  Early          Piano
## 6    9  Early         Mozart
## 7    7  Early         Mozart
## 8    9  Early         Mozart
## 9    8  Early         Mozart
## 10  12  Early         Mozart
## 11  12 Middle          Piano
## 12  15 Middle          Piano
## 13  18 Middle          Piano
## 14  20 Middle          Piano
## 15  19 Middle          Piano
## 16  15 Middle         Mozart
## 17   8 Middle         Mozart
## 18  12 Middle         Mozart
## 19  13 Middle         Mozart
## 20  11 Middle         Mozart
## 21  29  Early Easy Listening
## 22  26  Early Easy Listening
## 23  30  Early Easy Listening
## 24  24  Early Easy Listening
## 25  26  Early Easy Listening
## 26  17 Middle Easy Listening
## 27  19 Middle Easy Listening
## 28  21 Middle Easy Listening
## 29  18 Middle Easy Listening
## 30  16 Middle Easy Listening
library("ggpubr")
ggboxplot(alzdf, x = "stage", y = "alz", color = "music")

ggboxplot(alzdf, x = "music", y = "alz", color = "stage")

library("ggpubr")
ggline(alzdf, x = "stage", y = "alz", color = "music",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

ggline(alzdf, x = "music", y = "alz", color = "stage",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

#Without Interaction
anova2 <- aov(alz ~ stage + music, data = alzdf)
summary(anova2)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## stage        1   86.7    86.7   7.202   0.0125 *  
## music        2  782.6   391.3  32.504 8.45e-08 ***
## Residuals   26  313.0    12.0                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#With Interaction

anova3 <- aov(alz ~ stage + music +stage:music, data = alzdf)
summary(anova3)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## stage        1   86.7    86.7   14.61 0.000824 ***
## music        2  782.6   391.3   65.95 1.77e-10 ***
## stage:music  2  170.6    85.3   14.38 7.86e-05 ***
## Residuals   24  142.4     5.9                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey 
TukeyHSD(anova3, which = "stage")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
## 
## $stage
##              diff       lwr       upr     p adj
## Middle-Early -3.4 -5.235723 -1.564277 0.0008239
TukeyHSD(anova3, which = "music")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
## 
## $music
##                        diff        lwr        upr     p adj
## Mozart-Easy Listening -12.2 -14.920401 -9.4795995 0.0000000
## Piano-Easy Listening   -3.7  -6.420401 -0.9795995 0.0064787
## Piano-Mozart            8.5   5.779599 11.2204005 0.0000001

Unbalanced ANOVA

library(car)
#Type III tests are only meaningful with sum-to-zero contrasts; consider setting
#options(contrasts = c("contr.sum", "contr.poly")) before fitting. (This particular
#design is balanced, 5 observations per cell; the distinction matters once cell sizes differ.)
tIII.ANOVA <- aov(alz ~ stage + music +stage:music, data = alzdf)
Anova(tIII.ANOVA, type = "III")
## Anova Table (Type III tests)
## 
## Response: alz
##             Sum Sq Df F value    Pr(>F)    
## (Intercept) 3645.0  1 614.326 < 2.2e-16 ***
## stage        193.6  1  32.629 6.942e-06 ***
## music        840.0  2  70.787 8.603e-11 ***
## stage:music  170.6  2  14.376 7.863e-05 ***
## Residuals    142.4 24                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Topic 13 Multiple Regression

house=read.csv("HouseSales.csv")
attach(house)
## The following object is masked from package:BSDA:
## 
##     Heating
lm1=lm(SalePrice~GrLivArea + BedroomAbvGr, house)
summary(lm1)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + BedroomAbvGr, data = house)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -549234  -27252    -349   23139  298053 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   62686.434   5336.730   11.75   <2e-16 ***
## GrLivArea       128.899      3.087   41.76   <2e-16 ***
## BedroomAbvGr -26899.823   1988.164  -13.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52870 on 1457 degrees of freedom
## Multiple R-squared:  0.5577, Adjusted R-squared:  0.5571 
## F-statistic: 918.6 on 2 and 1457 DF,  p-value: < 2.2e-16
plot(lm1)

# Confidence interval for the mean response (for all observations)
# Careful: the original code called predict(lm, ...), which silently picked up the
# fire-damage model named lm from Topic 7 and printed 16 rows of damage fits;
# use lm1, the house-price fit (head() keeps the 1460-row printout short)
head(predict(lm1, interval = "confidence"))
#For new data:
Newdf = data.frame(GrLivArea = c(2000), BedroomAbvGr = c(3))
predict(lm1, Newdf, interval = "confidence")
##        fit      lwr      upr
## 1 239784.6 235956.9 243612.2
# Prediction interval for a new observation (wider than the confidence interval)
predict(lm1, Newdf, interval = "predict")
##        fit      lwr      upr
## 1 239784.6 136006.6 343562.5
#cooks.distance(lm1)
Bath = FullBath + 0.5*HalfBath
y = as.integer(format(Sys.Date(), format="%Y"))
Age = y-YearBuilt

house2=cbind(house, Bath, Age)
attach(house2)
## The following objects are masked _by_ .GlobalEnv:
## 
##     Age, Bath
## The following objects are masked from house:
## 
##     Alley, BedroomAbvGr, BldgType, BsmtCond, BsmtExposure,
##     BsmtFinSF1, BsmtFinSF2, BsmtFinType1, BsmtFinType2,
##     BsmtFullBath, BsmtHalfBath, BsmtQual, BsmtUnfSF, CentralAir,
##     Condition1, Condition2, Electrical, EnclosedPorch, ExterCond,
##     Exterior1st, Exterior2nd, ExterQual, Fence, FireplaceQu,
##     Fireplaces, Foundation, FullBath, Functional, GarageArea,
##     GarageCars, GarageCond, GarageFinish, GarageQual, GarageType,
##     GarageYrBlt, GrLivArea, HalfBath, Heating, HeatingQC,
##     HouseStyle, Id, KitchenAbvGr, KitchenQual, LandContour,
##     LandSlope, LotArea, LotConfig, LotFrontage, LotShape,
##     LowQualFinSF, MasVnrArea, MasVnrType, MiscFeature, MiscVal,
##     MoSold, Neighborhood, OpenPorchSF, OverallCond, OverallQual,
##     PavedDrive, PoolArea, PoolQC, RoofMatl, RoofStyle,
##     SaleCondition, SalePrice, SaleType, ScreenPorch, TotalBsmtSF,
##     TotRmsAbvGrd, Utilities, WoodDeckSF, X1stFlrSF, X2ndFlrSF,
##     X3SsnPorch, YearBuilt, YearRemodAdd, YrSold
## The following object is masked from package:BSDA:
## 
##     Heating
lm2=lm(SalePrice ~ BedroomAbvGr + Fireplaces +GrLivArea + Bath + LotArea + MoSold + TotalBsmtSF + Age + GarageCars + OverallCond + OverallQual, house2)
summary(lm2)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + Fireplaces + GrLivArea + 
##     Bath + LotArea + MoSold + TotalBsmtSF + Age + GarageCars + 
##     OverallCond + OverallQual, data = house2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515515  -18240   -1494   14310  273537 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.441e+04  8.950e+03  -7.197 9.83e-13 ***
## BedroomAbvGr -7.620e+03  1.499e+03  -5.085 4.16e-07 ***
## Fireplaces    6.381e+03  1.761e+03   3.623 0.000302 ***
## GrLivArea     5.899e+01  3.772e+00  15.637  < 2e-16 ***
## Bath         -2.599e+03  2.734e+03  -0.951 0.341892    
## LotArea       5.542e-01  1.040e-01   5.327 1.15e-07 ***
## MoSold       -5.318e+01  3.572e+02  -0.149 0.881682    
## TotalBsmtSF   2.431e+01  2.959e+00   8.217 4.57e-16 ***
## Age          -4.717e+02  5.189e+01  -9.092  < 2e-16 ***
## GarageCars    1.327e+04  1.756e+03   7.555 7.40e-14 ***
## OverallCond   6.713e+03  9.572e+02   7.013 3.58e-12 ***
## OverallQual   1.785e+04  1.177e+03  15.171  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36720 on 1448 degrees of freedom
## Multiple R-squared:  0.7879, Adjusted R-squared:  0.7863 
## F-statistic: 489.1 on 11 and 1448 DF,  p-value: < 2.2e-16
plot(lm2)

library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:corrgram':
## 
##     auto
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_step_backward_aic(lm2)
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . BedroomAbvGr 
## 2 . Fireplaces 
## 3 . GrLivArea 
## 4 . Bath 
## 5 . LotArea 
## 6 . MoSold 
## 7 . TotalBsmtSF 
## 8 . Age 
## 9 . GarageCars 
## 10 . OverallCond 
## 11 . OverallQual 
## 
## 
## Variables Removed: 
## 
## - MoSold 
## - Bath 
## 
## No more variables to be removed.
## 
## 
##                           Backward Elimination Summary                           
## -------------------------------------------------------------------------------
## Variable         AIC           RSS            Sum Sq        R-Sq      Adj. R-Sq 
## -------------------------------------------------------------------------------
## Full Model    34849.793    1.952688e+12    7.255223e+12    0.78793      0.78632 
## MoSold        34847.816    1.952718e+12    7.255193e+12    0.78793      0.78647 
## Bath          34846.727    1.953937e+12    7.253975e+12    0.78780      0.78648 
## -------------------------------------------------------------------------------
ols_step_forward_aic(lm2)
## Forward Selection Method 
## ------------------------
## 
## Candidate Terms: 
## 
## 1 . BedroomAbvGr 
## 2 . Fireplaces 
## 3 . GrLivArea 
## 4 . Bath 
## 5 . LotArea 
## 6 . MoSold 
## 7 . TotalBsmtSF 
## 8 . Age 
## 9 . GarageCars 
## 10 . OverallCond 
## 11 . OverallQual 
## 
## 
## Variables Entered: 
## 
## - OverallQual 
## - GrLivArea 
## - TotalBsmtSF 
## - GarageCars 
## - Age 
## - OverallCond 
## - LotArea 
## - BedroomAbvGr 
## - Fireplaces 
## 
## No more variables to be added.
## 
##                                 Selection Summary                                 
## ---------------------------------------------------------------------------------
## Variable           AIC          Sum Sq           RSS          R-Sq      Adj. R-Sq 
## ---------------------------------------------------------------------------------
## OverallQual     35659.493    5.760947e+12    3.446964e+12    0.62565      0.62540 
## GrLivArea       35267.584    6.576044e+12    2.631868e+12    0.71417      0.71378 
## TotalBsmtSF     35119.663    6.832887e+12    2.375025e+12    0.74207      0.74154 
## GarageCars      35012.383    7.004165e+12    2.203746e+12    0.76067      0.76001 
## Age             34969.203    7.071316e+12    2.136595e+12    0.76796      0.76716 
## OverallCond     34923.147    7.140498e+12    2.067413e+12    0.77547      0.77455 
## LotArea         34889.198    7.190782e+12     2.01713e+12    0.78094      0.77988 
## BedroomAbvGr    34858.234    7.235814e+12    1.972098e+12    0.78583      0.78464 
## Fireplaces      34846.727    7.253975e+12    1.953937e+12    0.78780      0.78648 
## ---------------------------------------------------------------------------------
#If VIF >10 we have issues with multicollinearity
library(car)
vif(lm2)
## BedroomAbvGr   Fireplaces    GrLivArea         Bath      LotArea 
##     1.616956     1.395134     4.251037     3.271587     1.166563 
##       MoSold  TotalBsmtSF          Age   GarageCars  OverallCond 
##     1.009299     1.823162     2.656941     1.863399     1.227618 
##  OverallQual 
##     2.865238

Topic 14 Logistic Regression

I couldn’t get the SAS data to load, so I used the regression analysis (408) example instead.

Data=read.table("http://users.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%2014%20Data%20Sets/CH14PR13.txt")
names(Data) = c("Y", "X1", "X2") #X1: annual family income ($1000s), X2: age of the oldest family car, Y: 1 if a new car was purchased
n=nrow(Data)

car_logit <- glm(Y~X1+X2,family=binomial(link=logit),data=Data)
summary(car_logit)
## 
## Call:
## glm(formula = Y ~ X1 + X2, family = binomial(link = logit), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6189  -0.8949  -0.5880   0.9653   2.0846  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -4.73931    2.10195  -2.255   0.0242 *
## X1           0.06773    0.02806   2.414   0.0158 *
## X2           0.59863    0.39007   1.535   0.1249  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.987  on 32  degrees of freedom
## Residual deviance: 36.690  on 30  degrees of freedom
## AIC: 42.69
## 
## Number of Fisher Scoring iterations: 4
expb1 = exp(coef(car_logit)[2])
expb2 = exp(coef(car_logit)[3])
c(expb1,expb2)
##       X1       X2 
## 1.070079 1.819627

The odds of purchasing a car increase by about 7% for each additional thousand dollars of family income, holding everything else constant.

The odds of purchasing a car increase by about 82% for each additional year of age of the oldest family automobile, holding everything else constant.

#Predicted probability of purchase at X1 = 50 (income $50k) and X2 = 3 (3-year-old car);
#type = "response" applies the inverse logit exp(eta)/(1 + exp(eta)) for us
predict(car_logit, newdata = data.frame(X1 = 50, X2 = 3), type = "response")
##         1 
## 0.6090245
#I didn't know anything about the ROC curve sooooo..... a hedged first attempt is sketched below
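
A minimal sketch, assuming the pROC package is installed; roc() takes the observed 0/1 response and the fitted probabilities.

library(pROC)
pred_probs = predict(car_logit, type = "response") #fitted probability for every row
roc_obj = roc(Data$Y, pred_probs)
plot(roc_obj) #sensitivity vs. specificity across cutoffs
auc(roc_obj)  #area under the curve summarizes discrimination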

Topic 15 Repeated Measure

My class did not get to this :(

Topic 16 Stratified Contingency Tables

library(stats) #stats is attached by default, so this line is optional
#WWOOOOOOWWW! Would have been nice to know about this way of inputting data earlier... UGH!
Input = ("
County       Sex     Result  Count
Bloom        Female  Pass     9
Bloom        Female  Fail     5
Bloom        Male    Pass     7
Bloom        Male    Fail    17
Cobblestone  Female  Pass    11
Cobblestone  Female  Fail    4
Cobblestone  Male    Pass    9
Cobblestone  Male    Fail    21
Dougal       Female  Pass     9
Dougal       Female  Fail     7
Dougal       Male    Pass    19
Dougal       Male    Fail     9
Heimlich     Female  Pass    15
Heimlich     Female  Fail     8
Heimlich     Male    Pass    14
Heimlich     Male    Fail    17
")
Data = read.table(textConnection(Input),header=TRUE)

### Order factors otherwise R will alphabetize them
Data$County = factor(Data$County, levels=unique(Data$County))
Data$Sex  = factor(Data$Sex, levels=unique(Data$Sex))
Data$Result = factor(Data$Result, levels=unique(Data$Result))

### Convert data to a table
Table = xtabs(Count ~ Sex + Result + County, 
              data=Data)

###  Note that the grouping variable is last in the xtabs function
ftable(Table)                     # Display a flattened table
##               County Bloom Cobblestone Dougal Heimlich
## Sex    Result                                         
## Female Pass              9          11      9       15
##        Fail              5           4      7        8
## Male   Pass              7           9     19       14
##        Fail             17          21      9       17
#Cochran–Mantel–Haenszel test
mantelhaen.test(Table)
## 
##  Mantel-Haenszel chi-squared test with continuity correction
## 
## data:  Table
## Mantel-Haenszel X-squared = 6.7314, df = 1, p-value = 0.009473
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.248399 4.260003
## sample estimates:
## common odds ratio 
##          2.306119
#Woolf test
library(vcd)
## Loading required package: grid
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:BSDA':
## 
##     Trucks
woolf_test(Table)
## 
##  Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)
## 
## data:  Table
## X-squared = 7.1376, df = 3, p-value = 0.06764

Topic 17 Survival Analysis

My class did not get to this either :(

Topic 18 SQL

data("BOD")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/library/tcltk/libs//tcltk.so'' had status 1
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
sqldf('SELECT demand FROM BOD')
##   demand
## 1    8.3
## 2   10.3
## 3   19.0
## 4   16.0
## 5   15.6
## 6   19.8

SQL clause order: SELECT column(s) FROM table-name | view-name [WHERE expression] [GROUP BY column(s) [HAVING expression]] [ORDER BY column(s)]
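
A sketch of a fuller query on the built-in mtcars data, exercising the clauses above (SQLite, which sqldf uses by default, allows the column alias in HAVING):

sqldf('SELECT cyl, COUNT(*) AS n, AVG(mpg) AS avgmpg
       FROM mtcars
       GROUP BY cyl
       HAVING avgmpg < 25
       ORDER BY avgmpg')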