Length=c(12, 3, 200, 30)
Width=c(18, 10, 50, NA)
Perimeter = 2*(Length + Width) #NA in Width propagates: Perimeter[4] is NA
Subject = c("A101", "N342", "D527", "C339")
Sex = c("F", "M", "M", "F")
Age = c(13.1, 15.8, 12.7, 13.4)
Time = c(75, 67, 72, 74)
Color = c(NA,NA,NA,NA)
mtime = data.frame(Subject, Sex, Age, Time, Color)
#Sort by Time (note the comma: we are selecting rows and keeping all columns)
mtime[order(Time),]
## Subject Sex Age Time Color
## 2 N342 M 15.8 67 NA
## 3 D527 M 12.7 72 NA
## 4 C339 F 13.4 74 NA
## 1 A101 F 13.1 75 NA
#Negative sign makes it descending
mtime[order(-Time),]
## Subject Sex Age Time Color
## 1 A101 F 13.1 75 NA
## 4 C339 F 13.4 74 NA
## 3 D527 M 12.7 72 NA
## 2 N342 M 15.8 67 NA
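#You can also sort on multiple keys, e.g. by Sex and then by Time within Sex:
mtime[order(Sex, Time),]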
Subject = c("B324", "A712", "H308", "C119")
Sex = c("M", "M", "M", "F")
Age = c(12.5, 13.7, 13.1, 13.3)
Time = c(71, 73, 68, 75)
Color = c("Gray", "Brown", "White", "Gray")
MazeTime = data.frame(Subject, Sex, Age, Time, Color)
#Note: to rbind these I had to give mtime a Color column (all NA) so the variables align
Combined = rbind(mtime, MazeTime); Combined
## Subject Sex Age Time Color
## 1 A101 F 13.1 75 <NA>
## 2 N342 M 15.8 67 <NA>
## 3 D527 M 12.7 72 <NA>
## 4 C339 F 13.4 74 <NA>
## 5 B324 M 12.5 71 Gray
## 6 A712 M 13.7 73 Brown
## 7 H308 M 13.1 68 White
## 8 C119 F 13.3 75 Gray
Subject = c("A101","N342", "D527", "C339")
Time2 = c(71, 68, 64, 67)
Retest = data.frame(Subject, Time2)
Merged = merge(mtime, Retest,by="Subject"); Merged
## Subject Sex Age Time Color Time2
## 1 A101 F 13.1 75 NA 71
## 2 C339 F 13.4 74 NA 67
## 3 D527 M 12.7 72 NA 64
## 4 N342 M 15.8 67 NA 68
#Note: rows without a match in both data frames are dropped (an inner join)
Merged2 = merge(Combined, Retest,by="Subject"); Merged2
## Subject Sex Age Time Color Time2
## 1 A101 F 13.1 75 <NA> 71
## 2 C339 F 13.4 74 <NA> 67
## 3 D527 M 12.7 72 <NA> 64
## 4 N342 M 15.8 67 <NA> 68
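#To keep unmatched rows instead of dropping them, use an outer join via all.x/all.y/all.
#A quick sketch: rows of Combined without a retest get NA for Time2
MergedAll = merge(Combined, Retest, by="Subject", all.x = TRUE); MergedAll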
MaleMice = subset(Combined, Sex=="M"); MaleMice
## Subject Sex Age Time Color
## 2 N342 M 15.8 67 <NA>
## 3 D527 M 12.7 72 <NA>
## 5 B324 M 12.5 71 Gray
## 6 A712 M 13.7 73 Brown
## 7 H308 M 13.1 68 White
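#subset() also takes compound conditions and a select argument, e.g. (a quick sketch):
FastMales = subset(Combined, Sex=="M" & Time < 72, select = c(Subject, Time)); FastMales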
machine = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1)
Deviation = c(-.0653, 0.0141, -.0702, -.0734, -.0649, -.0601, -.0631, -.0148, -.0731, -.0764, -.0275, -.0497, -.0741, -.0673, -.0573, -.0629, -.0671, -.0246, -.0222, -.0807, -.0621, -.0785, -.0544, -.0511, -.0138, -.0609, 0.0038, -.0758, -.0731, -.0455)
Parts = data.frame(machine, Deviation)
mean(Deviation)
## [1] -0.05306667
mean(subset(Parts, machine == 1)$Deviation)
## [1] -0.05213333
mean(subset(Parts, machine == 2)$Deviation)
## [1] -0.05446667
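#A shorter way to get all the group means at once (same numbers as the subsets above):
aggregate(Deviation ~ machine, data = Parts, FUN = mean)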
library(moments)
sd(Deviation)
## [1] 0.02543624
skewness(Deviation)
## [1] 1.192545
kurtosis(Deviation)
## [1] 3.393343
hist(Deviation, freq = FALSE) #freq = FALSE uses a density scale so the curve matches
lines(density(Deviation))
hist(Deviation, breaks = seq(from = -0.09, to = 0.03, by = 0.01))
qqnorm(Deviation)
library(car)
## Loading required package: carData
qqPlot(Deviation) #car's version adds a confidence envelope; the printed indices flag the most extreme points
## [1] 2 27
boxplot(Deviation)
boxplot(Deviation~machine)
Data=read.table("TeacherSalary.txt")
names(Data) <- c("ID", "Gender", "Degree", "Years", "Salary")
attach(Data)
DegreeSummary <- table(Degree); DegreeSummary
## Degree
## BS MS PHD
## 10 6 3
#Or use factor
factor(Degree)
## [1] BS BS BS MS BS BS MS MS BS BS PHD MS PHD MS BS BS MS
## [18] BS PHD
## Levels: BS MS PHD
table(subset(Data, Gender == "M")$Degree)
##
## BS MS PHD
## 6 4 2
table(subset(Data, Gender == "F")$Degree)
##
## BS MS PHD
## 4 2 1
mytable = table(Gender,Degree) # A will be rows, B will be columns
mytable
## Degree
## Gender BS MS PHD
## F 4 2 1
## M 6 4 2
margin.table(mytable, 1) # A frequencies (summed over B)
## Gender
## F M
## 7 12
margin.table(mytable, 2) # B frequencies (summed over A)
## Degree
## BS MS PHD
## 10 6 3
prop.table(mytable) # cell percentages
## Degree
## Gender BS MS PHD
## F 0.21052632 0.10526316 0.05263158
## M 0.31578947 0.21052632 0.10526316
prop.table(mytable, 1) # row percentages
## Degree
## Gender BS MS PHD
## F 0.5714286 0.2857143 0.1428571
## M 0.5000000 0.3333333 0.1666667
prop.table(mytable, 2) # column percentages
## Degree
## Gender BS MS PHD
## F 0.4000000 0.3333333 0.3333333
## M 0.6000000 0.6666667 0.6666667
#Rebuilding a table from summary counts using weights
gender=c("F", "F", "F", "M", "M", "M")
degree = c("BS", "MS", "PHD", "BS", "MS", "PHD")
freq = c(4, 2, 1, 6, 4, 2)
#Bug fixed: data.frame("gender", "degree", "freq") made a one-row frame of string
#literals, so wtd.table returned a table of extent 0. Pass the vectors unquoted:
degreebygender=data.frame(gender, degree, freq)
library(questionr)
wtd.table(x = degreebygender$degree, weights = degreebygender$freq)
#This weights each degree by its count: BS 10, MS 6, PHD 3
barplot(table(Degree)) #Note: barplot() needs counts from table(), not a raw factor
barplot(table(Degree), horiz=TRUE)
barplot(table(Years))
barplot(table(Gender,Salary))
barplot(table(Salary,Gender), beside = TRUE)
pie(table(Years)) #pie() needs counts too; pie(Years) would draw one slice per teacher
plot(Salary, Years)
library(car)
scatterplot(Salary ~ Years)
## Warning in Ops.factor(x[floor(d)], x[ceiling(d)]): '+' not meaningful for
## factors
## Warning in smoother(.x, .y, col = col[1], log.x = logged("x"), log.y =
## logged("y"), : could not fit smooth
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
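#The warnings above suggest Salary was read in as a factor; converting it to numeric
#first (a sketch, assuming the values are plain numbers) should avoid them:
SalaryNum = as.numeric(as.character(Salary))
scatterplot(SalaryNum ~ Years)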
Hours=c(13, 8, 18, 11, 11, 10, 15, 9, 17, 14)
summary(Hours)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 10.25 12.00 12.60 14.75 18.00
mean(Hours)
## [1] 12.6
sd(Hours)
## [1] 3.373096
t.test(Hours,mu=11.8, conf.level = .98) # Ho: mu=11.8
##
## One Sample t-test
##
## data: Hours
## t = 0.75, df = 9, p-value = 0.4724
## alternative hypothesis: true mean is not equal to 11.8
## 98 percent confidence interval:
## 9.590466 15.609534
## sample estimates:
## mean of x
## 12.6
help(t.test)
#One Sided
t.test(Hours,mu=11.8, alternative = "greater")
##
## One Sample t-test
##
## data: Hours
## t = 0.75, df = 9, p-value = 0.2362
## alternative hypothesis: true mean is greater than 11.8
## 95 percent confidence interval:
## 10.64468 Inf
## sample estimates:
## mean of x
## 12.6
t.test(Hours,mu=11.8, alternative = "less")
##
## One Sample t-test
##
## data: Hours
## t = 0.75, df = 9, p-value = 0.7638
## alternative hypothesis: true mean is less than 11.8
## 95 percent confidence interval:
## -Inf 14.55532
## sample estimates:
## mean of x
## 12.6
T25 = c(49, 34, 24, 32, 52, 14, 28, 18, 28, 47, 60)
T35 = c(28, 55, 45, 51, 41, 27, 44, 48, 54, 67, 46, 59)
Boxplot(T25)
Boxplot(T35)
var.test(T25,T35) #No significant difference in variances, so we can use the pooled t-test.
##
## F test to compare two variances
##
## data: T25 and T35
## F = 1.6393, num df = 10, denom df = 11, p-value = 0.4294
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4649565 6.0078345
## sample estimates:
## ratio of variances
## 1.639284
t.test(T25,T35, var.equal = TRUE)
##
## Two Sample t-test
##
## data: T25 and T35
## t = -2.165, df = 21, p-value = 0.04205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.5116483 -0.4732002
## sample estimates:
## mean of x mean of y
## 35.09091 47.08333
t.test(T25,T35)
##
## Welch Two Sample t-test
##
## data: T25 and T35
## t = -2.1413, df = 18.93, p-value = 0.04548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.7175653 -0.2672832
## sample estimates:
## mean of x mean of y
## 35.09091 47.08333
Baseline=c(6.37, 5.69, 5.58, 5.27, 5.11, 4.89, 4.7, 3.53)
Caffeine = c( 4.52, 5.44, 4.7, 3.81, 4.06, 3.22, 2.96, 3.2)
t.test(Baseline, Caffeine, paired = TRUE)
##
## Paired t-test
##
## data: Baseline and Caffeine
## t = 5.1878, df = 7, p-value = 0.00127
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.6278643 1.6796357
## sample estimates:
## mean of the differences
## 1.15375
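#Equivalent check: a paired t-test is just a one-sample t-test on the differences
t.test(Baseline - Caffeine, mu = 0)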
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following objects are masked from 'Data':
##
## Degree, Salary
## The following objects are masked from 'package:carData':
##
## Vocab, Wool
## The following object is masked from 'package:datasets':
##
## Orange
A=c( 276.3, 288.4, 280.2, 294.7, 268.7, 282.8, 276.1, 179)
B=c( 275.1, 286.8, 277.3, 290.6, 269.1, 281, 275.3, 179.1)
SIGN.test(A,B)
##
## Dependent-samples Sign-Test
##
## data: A and B
## S = 6, p-value = 0.2891
## alternative hypothesis: true median difference is not equal to 0
## 95 percent confidence interval:
## -0.1975 3.2900
## sample estimates:
## median of x-y
## 1.4
##
## Achieved and Interpolated Confidence Intervals:
##
## Conf.Level L.E.pt U.E.pt
## Lower Achieved CI 0.9297 -0.1000 2.90
## Interpolated CI 0.9500 -0.1975 3.29
## Upper Achieved CI 0.9922 -0.4000 4.10
wilcox.test(A, B, paired = TRUE) #Wilcoxon Signed Rank Test
##
## Wilcoxon signed rank test
##
## data: A and B
## V = 33, p-value = 0.03906
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(A, B, paired = FALSE) #Wilcoxon Rank Sum Test
##
## Wilcoxon rank sum test
##
## data: A and B
## W = 35, p-value = 0.7984
## alternative hypothesis: true location shift is not equal to 0
year = seq.int(from = 1996, to = 2007)
csco = c(0.704, 0.314, 1.50, 1.31, -0.286, -0.527, -0.277, 0.85, -0.203, 0.029, 0.434, 0.044)
wd = c(0.204, 0.448, -0.080, -0.015, -0.004, -0.277, -0.203, 0.444, 0.202, -0.129, 0.443, -0.043)
ge = c(0.565, 0.587, 0.451, 0.574, -0.055, -0.151, -0.377, 0.308, 0.207, -0.014, 0.093, 0.126)
xom = c(0.405, 0.342, 0.254, 0.151, 0.127, -0.066, -0.089, 0.206, 0.281, 0.118, 0.391, 0.243)
teco = c(-0.012, 0.223, 0.050, -0.303, 0.849, -0.150, -0.369, 0.004, 0.128, 0.170, 0.051, 0.058)
stockreturn = data.frame(year, csco, wd, ge, xom, teco)
help(cor)
cor(stockreturn)
## year csco wd ge xom teco
## year 1.00000000 -0.3935922 -0.02930211 -0.54302353 -0.1131893 -0.02207881
## csco -0.39359224 1.0000000 0.29563711 0.76624782 0.4635722 -0.24593499
## wd -0.02930211 0.2956371 1.00000000 0.53453465 0.7716692 0.20512889
## ge -0.54302353 0.7662478 0.53453465 1.00000000 0.7263548 -0.00905595
## xom -0.11318934 0.4635722 0.77166915 0.72635475 1.0000000 0.26481697
## teco -0.02207881 -0.2459350 0.20512889 -0.00905595 0.2648170 1.00000000
plot(stockreturn)
cor.test(csco, wd) #cor.test only handles two variables at a time; see below for all pairs at once
##
## Pearson's product-moment correlation
##
## data: csco and wd
## t = 0.97863, df = 10, p-value = 0.3508
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3351238 0.7434073
## sample estimates:
## cor
## 0.2956371
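#One way to test all the pairs at once (a sketch, assuming the Hmisc package is installed):
#rcorr() returns the correlation matrix along with a matrix of p-values
library(Hmisc)
rcorr(as.matrix(stockreturn))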
library("ggpubr")
## Loading required package: ggplot2
## Loading required package: magrittr
ggscatter(stockreturn, x = "csco", y = "wd", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Cisco Stock", ylab = "Walt Disney Stock")
#Interesting but not great
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
corrgram(stockreturn)
dist = c(3.4, 1.8, 4.6, 2.3, 3.1, 5.5, 0.7, 3.0, 2.6, 4.3, 2.1, 1.1, 6.4, 4.8, 3.8, 4.2)
damage = c(26.2, 17.8, 31.3, 23.1, 27.5, 36.0, 14.1, 22.3, 19.6, 31.3, 24.0, 17.3, 43.2, 36.4, 26.1, 28.8)
firedamage = data.frame(dist, damage)
lm = lm(damage~dist) #Careful: naming the object lm masks the lm() function
summary(lm)
##
## Call:
## lm(formula = damage ~ dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3521 -1.4473 -0.1764 1.7117 3.4349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.5397 1.3135 8.024 1.32e-06 ***
## dist 4.7740 0.3562 13.402 2.24e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.176 on 14 degrees of freedom
## Multiple R-squared: 0.9277, Adjusted R-squared: 0.9225
## F-statistic: 179.6 on 1 and 14 DF, p-value: 2.237e-09
plot(lm) #diagnostics
leveragePlots(lm)
ggscatter(firedamage, x = "dist", y = "damage", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Distance From Firehouse (miles)", ylab = "Damage ($1000)")
# Confidence interval for the mean response
Newdist = data.frame(dist = 5)
predict(lm, Newdist, interval = "confidence")
## fit lwr upr
## 1 34.40981 32.69554 36.12408
# Prediction interval for a new observation
predict(lm, Newdist, interval = "predict")
## fit lwr upr
## 1 34.40981 29.43758 39.38204
Yr = c(1913, 1923, 1929, 1933, 1943, 1953,1963, 1973, 1983, 1993, 1999, 2003, 2007, 2008, 2013)
Dow = c(1886, 1252, 4556, 1694, 1770, 2367, 5761, 4161, 2715, 5727, 14690, 12018, 14315, 9421, 16123)
djia = data.frame(Yr, Dow)
fit = lm(Dow~Yr)
plot(fit)
ggscatter(djia, x = "Yr", y = "Dow", add = "reg.line", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", xlab = "Year", ylab = "Dow")
Yr2= Yr^2
quad = lm(Dow~Yr+Yr2)
summary(quad)
##
## Call:
## lm(formula = Dow ~ Yr + Yr2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3742.8 -1086.8 -51.5 1519.8 4351.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.491e+06 3.094e+06 2.745 0.0178 *
## Yr -8.764e+03 3.149e+03 -2.783 0.0166 *
## Yr2 2.262e+00 8.012e-01 2.823 0.0154 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2517 on 12 degrees of freedom
## Multiple R-squared: 0.8079, Adjusted R-squared: 0.7759
## F-statistic: 25.24 on 2 and 12 DF, p-value: 5.021e-05
plot(Dow~Yr)
lines(smooth.spline(Yr, predict(quad)))
Yr3= Yr^3
cubic = lm(Dow~Yr+Yr2+Yr3)
summary(cubic)
##
## Call:
## lm(formula = Dow ~ Yr + Yr2 + Yr3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3931.3 -970.7 63.6 1091.9 4914.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.329e+08 2.261e+08 -1.030 0.325
## Yr 3.602e+05 3.455e+05 1.043 0.319
## Yr2 -1.857e+02 1.760e+02 -1.055 0.314
## Yr3 3.191e-02 2.987e-02 1.068 0.308
##
## Residual standard error: 2502 on 11 degrees of freedom
## Multiple R-squared: 0.826, Adjusted R-squared: 0.7785
## F-statistic: 17.4 on 3 and 11 DF, p-value: 0.0001735
plot(Dow~Yr)
lines(smooth.spline(Yr, predict(cubic)))
lny=log(Dow)
logfit= lm(lny~Yr)
plot(lny~Yr)
lines(smooth.spline(Yr, predict(logfit)))
yhat=exp(predict(logfit))
plot(Dow~Yr)
lines(smooth.spline(Yr, yhat))
points(Yr, yhat,col=3)
#help(points)
#Box-Cox: estimates a power transformation of the response (lambda near 0 suggests a log)
boxCox(lm) #from the car package; lm here is still the fire-damage model object
library(nlmrt)
x = c(4,6, 8, 10, 12, 14, 16, 18, 20)
y=c(0.2,0.7,1.4,2.8,5.8,11.1,21.7,43.2,80.1)
data=data.frame(x,y)
model = wrapnls(formula = y~ a*exp(b*x), data = data, start = c(a = 0.04, b=0.4))
#help(wrapnls)
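#Base R alternative (a sketch): nls() fits the same exponential model, though it can
#be fussier about starting values than wrapnls
expfit = nls(y ~ a*exp(b*x), data = data, start = list(a = 0.04, b = 0.4))
summary(expfit)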
#Reading this in and splitting the packed columns apart took some work
d=read.table("DecodingData.txt")
df = read.fwf(file = textConnection(as.character(d[, 1])), widths = c(2, 1, 1, 1, 1), colClasses = "character", col.names = c("a", "s", "ys", "cf", "p"))
#This separates the data into a matrix, not ideal
df2 <- cbind(a=substr(d[, 1],1,2), s=substr(d[, 1],3,3), ys=substr(d[, 1],4,4), cf=substr(d[, 1],5,5), p=substr(d[, 1],6,6))
# df = dataframe
# old.var.name = The name you don't like anymore
# new.var.name = The name you want to get
names(df)[names(df) == 'old.var.name'] <- 'new.var.name'
names(df)[names(df) == 'a'] <- 'Age'
names(df)[names(df) == 's'] <- 'Sex'
names(df)[names(df) == 'ys'] <- 'Years in College'
df
## Age Sex Years in College cf p
## 1 18 1 1 4 3
## 2 22 1 4 2 3
## 3 19 2 1 2 4
## 4 20 2 2 1 2
## 5 24 1 5 1 4
## 6 23 2 3 3 5
## 7 17 2 1 2 .
## 8 26 2 3 2 4
## 9 19 . 2 3 4
## 10 20 2 2 4 2
## 11 21 2 3 2 2
## 12 28 1 5 . .
## 13 18 2 1 3 1
## 14 20 2 2 1 4
## 15 22 1 4 . 3
## 16 23 2 4 2 3
## 17 20 1 1 3 5
## 18 20 1 4 2 4
## 19 27 2 5 3 4
## 20 21 2 4 1 2
## 21 18 2 1 2 3
## 22 18 2 1 3 3
## 23 22 2 4 2 4
## 24 20 1 2 4 5
## 25 25 2 1 3 3
## 26 19 1 1 4 3
## 27 21 3 1 . <NA>
## 28 19 2 2 2 3
## 29 22 2 5 2 2
## 30 21 1 4 4 4
## 31 19 2 2 3 1
df$Sex <- ordered(df$Sex, levels = c("1","2"), labels = c("Male", "Female"))
df$`Years in College` <- ordered(df$`Years in College`, levels = c("1","2","3","4","5"), labels = c("Freshman", "Sophomore", "Junior", "Senior", "Grad Student"))
df$cf <- ordered(df$cf, levels = c("1","2","3","4","5"), labels = c("Poor","Fair","Adequate","Good","Excellent"))
df$p <- ordered(df$p, levels = c("1","2","3","4","5"), labels = c("Poor","Fair","Adequate","Good","Excellent"))
df
## Age Sex Years in College cf p
## 1 18 Male Freshman Good Adequate
## 2 22 Male Senior Fair Adequate
## 3 19 Female Freshman Fair Good
## 4 20 Female Sophomore Poor Fair
## 5 24 Male Grad Student Poor Good
## 6 23 Female Junior Adequate Excellent
## 7 17 Female Freshman Fair <NA>
## 8 26 Female Junior Fair Good
## 9 19 <NA> Sophomore Adequate Good
## 10 20 Female Sophomore Good Fair
## 11 21 Female Junior Fair Fair
## 12 28 Male Grad Student <NA> <NA>
## 13 18 Female Freshman Adequate Poor
## 14 20 Female Sophomore Poor Good
## 15 22 Male Senior <NA> Adequate
## 16 23 Female Senior Fair Adequate
## 17 20 Male Freshman Adequate Excellent
## 18 20 Male Senior Fair Good
## 19 27 Female Grad Student Adequate Good
## 20 21 Female Senior Poor Fair
## 21 18 Female Freshman Fair Adequate
## 22 18 Female Freshman Adequate Adequate
## 23 22 Female Senior Fair Good
## 24 20 Male Sophomore Good Excellent
## 25 25 Female Freshman Adequate Adequate
## 26 19 Male Freshman Good Adequate
## 27 21 <NA> Freshman <NA> <NA>
## 28 19 Female Sophomore Fair Adequate
## 29 22 Female Grad Student Fair Fair
## 30 21 Male Senior Good Good
## 31 19 Female Sophomore Adequate Poor
#Pro tip: & is logical AND, while | is logical OR (the vertical bar, below the Delete key)
Status = ifelse((df$`Years in College` == "Freshman") | (df$`Years in College` == "Sophomore"), "Underclassman", ifelse((df$`Years in College` == "Junior") | (df$`Years in College` == "Senior"), "Upperclassman", "Graduate"))
dfnew=data.frame(df,Status)
dfnew
## Age Sex Years.in.College cf p Status
## 1 18 Male Freshman Good Adequate Underclassman
## 2 22 Male Senior Fair Adequate Upperclassman
## 3 19 Female Freshman Fair Good Underclassman
## 4 20 Female Sophomore Poor Fair Underclassman
## 5 24 Male Grad Student Poor Good Graduate
## 6 23 Female Junior Adequate Excellent Upperclassman
## 7 17 Female Freshman Fair <NA> Underclassman
## 8 26 Female Junior Fair Good Upperclassman
## 9 19 <NA> Sophomore Adequate Good Underclassman
## 10 20 Female Sophomore Good Fair Underclassman
## 11 21 Female Junior Fair Fair Upperclassman
## 12 28 Male Grad Student <NA> <NA> Graduate
## 13 18 Female Freshman Adequate Poor Underclassman
## 14 20 Female Sophomore Poor Good Underclassman
## 15 22 Male Senior <NA> Adequate Upperclassman
## 16 23 Female Senior Fair Adequate Upperclassman
## 17 20 Male Freshman Adequate Excellent Underclassman
## 18 20 Male Senior Fair Good Upperclassman
## 19 27 Female Grad Student Adequate Good Graduate
## 20 21 Female Senior Poor Fair Upperclassman
## 21 18 Female Freshman Fair Adequate Underclassman
## 22 18 Female Freshman Adequate Adequate Underclassman
## 23 22 Female Senior Fair Good Upperclassman
## 24 20 Male Sophomore Good Excellent Underclassman
## 25 25 Female Freshman Adequate Adequate Underclassman
## 26 19 Male Freshman Good Adequate Underclassman
## 27 21 <NA> Freshman <NA> <NA> Underclassman
## 28 19 Female Sophomore Fair Adequate Underclassman
## 29 22 Female Grad Student Fair Fair Graduate
## 30 21 Male Senior Good Good Upperclassman
## 31 19 Female Sophomore Adequate Poor Underclassman
agegroup = cut(as.numeric(df$Age), breaks = c(0, 16, 19, 21, 23, 27, max(df$Age)), labels = c("Out of Range", "17-19", "20-21", "22-23", "24-27", "28 and up"), right = TRUE)
dfnew2 = data.frame(dfnew, agegroup); dfnew2
## Age Sex Years.in.College cf p Status agegroup
## 1 18 Male Freshman Good Adequate Underclassman 17-19
## 2 22 Male Senior Fair Adequate Upperclassman 22-23
## 3 19 Female Freshman Fair Good Underclassman 17-19
## 4 20 Female Sophomore Poor Fair Underclassman 20-21
## 5 24 Male Grad Student Poor Good Graduate 24-27
## 6 23 Female Junior Adequate Excellent Upperclassman 22-23
## 7 17 Female Freshman Fair <NA> Underclassman 17-19
## 8 26 Female Junior Fair Good Upperclassman 24-27
## 9 19 <NA> Sophomore Adequate Good Underclassman 17-19
## 10 20 Female Sophomore Good Fair Underclassman 20-21
## 11 21 Female Junior Fair Fair Upperclassman 20-21
## 12 28 Male Grad Student <NA> <NA> Graduate 28 and up
## 13 18 Female Freshman Adequate Poor Underclassman 17-19
## 14 20 Female Sophomore Poor Good Underclassman 20-21
## 15 22 Male Senior <NA> Adequate Upperclassman 22-23
## 16 23 Female Senior Fair Adequate Upperclassman 22-23
## 17 20 Male Freshman Adequate Excellent Underclassman 20-21
## 18 20 Male Senior Fair Good Upperclassman 20-21
## 19 27 Female Grad Student Adequate Good Graduate 24-27
## 20 21 Female Senior Poor Fair Upperclassman 20-21
## 21 18 Female Freshman Fair Adequate Underclassman 17-19
## 22 18 Female Freshman Adequate Adequate Underclassman 17-19
## 23 22 Female Senior Fair Good Upperclassman 22-23
## 24 20 Male Sophomore Good Excellent Underclassman 20-21
## 25 25 Female Freshman Adequate Adequate Underclassman 24-27
## 26 19 Male Freshman Good Adequate Underclassman 17-19
## 27 21 <NA> Freshman <NA> <NA> Underclassman 20-21
## 28 19 Female Sophomore Fair Adequate Underclassman 17-19
## 29 22 Female Grad Student Fair Fair Graduate 22-23
## 30 21 Male Senior Good Good Upperclassman 20-21
## 31 19 Female Sophomore Adequate Poor Underclassman 17-19
plants = c(54, 122, 58)
gof=chisq.test(plants, p=c(1/4, 1/2, 1/4));gof #1:2:1 Expected ratio
##
## Chi-squared test for given probabilities
##
## data: plants
## X-squared = 0.5641, df = 2, p-value = 0.7542
gof=chisq.test(plants, p=c(.25, .50, .25)) #probabilities need to sum to 1
gof$expected
## [1] 58.5 117.0 58.5
#Note: a table built with table()/xtabs() as before would work here too
#Input data as a matrix (careful: naming it "matrix" masks the matrix() function)
matrix=matrix(c(24, 289, 9, 100, 13, 565), nrow = 2)
colnames(matrix)=c("No Vaccine", "One Shot", "Two Shot")
rownames(matrix)=c("Flu", "No Flu")
matrix
## No Vaccine One Shot Two Shot
## Flu 24 9 13
## No Flu 289 100 565
cstest=chisq.test(matrix);cstest
##
## Pearson's Chi-squared test
##
## data: matrix
## X-squared = 17.313, df = 2, p-value = 0.000174
cstest$expected
## No Vaccine One Shot Two Shot
## Flu 14.398 5.014 26.588
## No Flu 298.602 103.986 551.412
cstest$residuals
## No Vaccine One Shot Two Shot
## Flu 2.5305249 1.7801030 -2.635195
## No Flu -0.5556679 -0.3908858 0.578652
#If expected values are not >= 5, use Fisher's Exact Test:
fisher.test(matrix)
##
## Fisher's Exact Test for Count Data
##
## data: matrix
## p-value = 0.0001072
## alternative hypothesis: two.sided
M=matrix(c(237, 197, 3489, 5870), nrow=2)
colnames(M)=c("Low", "Normal")
rownames(M)=c("Smoker", "Non Smoker")
M
## Low Normal
## Smoker 237 3489
## Non Smoker 197 5870
barplot(M)
barplot(M, beside = TRUE)
library(epiR)
## Loading required package: survival
## Package epiR 0.9-99 is loaded
## Type help(epi.about) for summary information
##
epi.2by2(M)
## Outcome + Outcome - Total Inc risk *
## Exposed + 237 3489 3726 6.36
## Exposed - 197 5870 6067 3.25
## Total 434 9359 9793 4.43
## Odds
## Exposed + 0.0679
## Exposed - 0.0336
## Total 0.0464
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 1.96 (1.63, 2.36)
## Odds ratio 2.02 (1.67, 2.46)
## Attrib risk * 3.11 (2.21, 4.02)
## Attrib risk in population * 1.18 (0.58, 1.79)
## Attrib fraction in exposed (%) 48.95 (38.61, 57.55)
## Attrib fraction in population (%) 26.73 (18.95, 33.77)
## -------------------------------------------------------------------
## X2 test statistic: 52.838 p-value: < 0.001
## Wald confidence limits
## * Outcomes per 100 population units
MCMData=matrix(c(26, 7, 15, 37), nrow=2)
colnames(MCMData)=c("Sibling (Control) yes", "no" )
rownames(MCMData)=c("Hodgkin Patient yes", "No")
MCMData
## Sibling (Control) yes no
## Hodgkin Patient yes 26 15
## No 7 37
mcnemar.test(MCMData)
##
## McNemar's Chi-squared test with continuity correction
##
## data: MCMData
## McNemar's chi-squared = 2.2273, df = 1, p-value = 0.1356
Kappa=matrix(c(9, 3, 6, 12), nrow=2)
colnames(Kappa)=c("yes", "no" )
rownames(Kappa)=c("yes", "No")
Kappa
## yes no
## yes 9 6
## No 3 12
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:questionr':
##
## describe
## The following object is masked from 'package:car':
##
## logit
cohen.kappa(Kappa)
## Call: cohen.kappa1(x = x, w = w, n.obs = n.obs, alpha = alpha, levels = levels)
##
## Cohen Kappa and Weighted Kappa correlation coefficients and confidence boundaries
## lower estimate upper
## unweighted kappa 0.079 0.4 0.72
## weighted kappa 0.079 0.4 0.72
##
## Number of subjects = 30
date()
## [1] "Sat Feb 9 14:20:39 2019"
t = Sys.Date();t #Today
## [1] "2019-02-09"
format(t, format="%B %d")
## [1] "February 09"
# convert date info in format 'mm/dd/yyyy'
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y");dates
## [1] "1965-01-05" "1975-08-16"
invest=20000
rate = 0.07
for (i in 1:5) {
invest=invest*(1+rate)
print(invest)
}
## [1] 21400
## [1] 22898
## [1] 24500.86
## [1] 26215.92
## [1] 28051.03
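#Vectorized equivalent of the loop above (no printing needed):
20000 * (1 + rate)^(1:5)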
scores = c( 101,118,98,107,116, 151,149,121,112,127,138, 119,109,198,186,160,134)
group = c("A","A","A","A","A","B","B","B","B","B","B","C","C","C","C","C","C")
videogame=data.frame(scores,group);videogame
## scores group
## 1 101 A
## 2 118 A
## 3 98 A
## 4 107 A
## 5 116 A
## 6 151 B
## 7 149 B
## 8 121 B
## 9 112 B
## 10 127 B
## 11 138 B
## 12 119 C
## 13 109 C
## 14 198 C
## 15 186 C
## 16 160 C
## 17 134 C
library("ggpubr")
ggboxplot(videogame, x = "group", y = "scores")
ggboxplot(videogame, x = "group", y = "scores", color = "group", order = c("A", "B", "C"), ylab = "Scores", xlab = "group")
ggline(videogame, x = "group", y = "scores", add = c("mean_se", "jitter"))
anova <- aov(scores ~ group, data = videogame)
# Summary of the analysis
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 5052 2526.0 4.349 0.034 *
## Residuals 14 8132 580.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey Post hoc Test
TukeyHSD(anova)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = scores ~ group, data = videogame)
##
## $group
## diff lwr upr p adj
## B-A 25 -13.196243 63.19624 0.2349865
## C-A 43 4.803757 81.19624 0.0269286
## C-B 18 -18.418689 54.41869 0.4215527
#If ANOVA assumptions are not met, use the non-parametric Kruskal-Wallis test:
kruskal.test(scores ~ group, data = videogame)
##
## Kruskal-Wallis rank sum test
##
## data: scores by group
## Kruskal-Wallis chi-squared = 7.7203, df = 2, p-value = 0.02107
#Typing repeated factor levels out by hand is tedious; rep() is much easier (see the sketch after the vectors below)
alz = c(21, 24, 22, 18, 20, 9, 7, 9, 8, 12, 12, 15, 18, 20, 19, 15, 8, 12, 13, 11)
stage = c("Early","Early","Early","Early","Early","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle")
music=c("Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart")
alzdf=data.frame(alz,stage,music); alzdf
## alz stage music
## 1 21 Early Piano
## 2 24 Early Piano
## 3 22 Early Piano
## 4 18 Early Piano
## 5 20 Early Piano
## 6 9 Early Mozart
## 7 7 Early Mozart
## 8 9 Early Mozart
## 9 8 Early Mozart
## 10 12 Early Mozart
## 11 12 Middle Piano
## 12 15 Middle Piano
## 13 18 Middle Piano
## 14 20 Middle Piano
## 15 19 Middle Piano
## 16 15 Middle Mozart
## 17 8 Middle Mozart
## 18 12 Middle Mozart
## 19 13 Middle Mozart
## 20 11 Middle Mozart
library("ggpubr")
ggboxplot(alzdf, x = "stage", y = "alz", color = "music")
ggboxplot(alzdf, x = "music", y = "alz", color = "stage")
library("ggpubr")
ggline(alzdf, x = "stage", y = "alz", color = "music",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
ggline(alzdf, x = "music", y = "alz", color = "stage",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
#Without Interaction
anova2 <- aov(alz ~ stage + music, data = alzdf)
summary(anova2)
## Df Sum Sq Mean Sq F value Pr(>F)
## stage 1 2.5 2.5 0.253 0.622
## music 1 361.3 361.3 37.254 1.17e-05 ***
## Residuals 17 164.9 9.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#With Interaction
anova3 <- aov(alz ~ stage + music +stage:music, data = alzdf)
summary(anova3)
## Df Sum Sq Mean Sq F value Pr(>F)
## stage 1 2.5 2.5 0.378 0.54712
## music 1 361.3 361.3 55.792 1.34e-06 ***
## stage:music 1 61.3 61.3 9.459 0.00724 **
## Residuals 16 103.6 6.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey
TukeyHSD(anova3, which = "stage")
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
##
## $stage
## diff lwr upr p adj
## Middle-Early -0.7 -3.112411 1.712411 0.5471223
TukeyHSD(anova3, which = "music")
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
##
## $music
## diff lwr upr p adj
## Piano-Mozart 8.5 6.087589 10.91241 1.3e-06
alz = c(21, 24, 22, 18, 20, 9, 7, 9, 8, 12, 12, 15, 18, 20, 19, 15, 8, 12, 13, 11,29,26,30,24,26,17,19,21,18,16)
stage = c("Early","Early","Early","Early","Early","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Middle","Early","Early","Early","Early","Early","Middle","Middle","Middle","Middle","Middle")
music=c("Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Piano","Piano","Piano","Piano","Piano","Mozart","Mozart","Mozart","Mozart","Mozart","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening","Easy Listening")
alzdf=data.frame(alz,stage,music); alzdf
## alz stage music
## 1 21 Early Piano
## 2 24 Early Piano
## 3 22 Early Piano
## 4 18 Early Piano
## 5 20 Early Piano
## 6 9 Early Mozart
## 7 7 Early Mozart
## 8 9 Early Mozart
## 9 8 Early Mozart
## 10 12 Early Mozart
## 11 12 Middle Piano
## 12 15 Middle Piano
## 13 18 Middle Piano
## 14 20 Middle Piano
## 15 19 Middle Piano
## 16 15 Middle Mozart
## 17 8 Middle Mozart
## 18 12 Middle Mozart
## 19 13 Middle Mozart
## 20 11 Middle Mozart
## 21 29 Early Easy Listening
## 22 26 Early Easy Listening
## 23 30 Early Easy Listening
## 24 24 Early Easy Listening
## 25 26 Early Easy Listening
## 26 17 Middle Easy Listening
## 27 19 Middle Easy Listening
## 28 21 Middle Easy Listening
## 29 18 Middle Easy Listening
## 30 16 Middle Easy Listening
library("ggpubr")
ggboxplot(alzdf, x = "stage", y = "alz", color = "music")
ggboxplot(alzdf, x = "music", y = "alz", color = "stage")
library("ggpubr")
ggline(alzdf, x = "stage", y = "alz", color = "music",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
ggline(alzdf, x = "music", y = "alz", color = "stage",add = c("mean_se", "dotplot"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
#Without Interaction
anova2 <- aov(alz ~ stage + music, data = alzdf)
summary(anova2)
## Df Sum Sq Mean Sq F value Pr(>F)
## stage 1 86.7 86.7 7.202 0.0125 *
## music 2 782.6 391.3 32.504 8.45e-08 ***
## Residuals 26 313.0 12.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#With Interaction
anova3 <- aov(alz ~ stage + music +stage:music, data = alzdf)
summary(anova3)
## Df Sum Sq Mean Sq F value Pr(>F)
## stage 1 86.7 86.7 14.61 0.000824 ***
## music 2 782.6 391.3 65.95 1.77e-10 ***
## stage:music 2 170.6 85.3 14.38 7.86e-05 ***
## Residuals 24 142.4 5.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tukey
TukeyHSD(anova3, which = "stage")
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
##
## $stage
## diff lwr upr p adj
## Middle-Early -3.4 -5.235723 -1.564277 0.0008239
TukeyHSD(anova3, which = "music")
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = alz ~ stage + music + stage:music, data = alzdf)
##
## $music
## diff lwr upr p adj
## Mozart-Easy Listening -12.2 -14.920401 -9.4795995 0.0000000
## Piano-Easy Listening -3.7 -6.420401 -0.9795995 0.0064787
## Piano-Mozart 8.5 5.779599 11.2204005 0.0000001
library(car)
tIII.ANOVA <- aov(alz ~ stage + music +stage:music, data = alzdf)
Anova(tIII.ANOVA, type = "III")
## Anova Table (Type III tests)
##
## Response: alz
## Sum Sq Df F value Pr(>F)
## (Intercept) 3645.0 1 614.326 < 2.2e-16 ***
## stage 193.6 1 32.629 6.942e-06 ***
## music 840.0 2 70.787 8.603e-11 ***
## stage:music 170.6 2 14.376 7.863e-05 ***
## Residuals 142.4 24
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
house=read.csv("HouseSales.csv")
attach(house)
## The following object is masked from package:BSDA:
##
## Heating
lm1=lm(SalePrice~GrLivArea + BedroomAbvGr, house)
summary(lm1)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + BedroomAbvGr, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -549234 -27252 -349 23139 298053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62686.434 5336.730 11.75 <2e-16 ***
## GrLivArea 128.899 3.087 41.76 <2e-16 ***
## BedroomAbvGr -26899.823 1988.164 -13.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52870 on 1457 degrees of freedom
## Multiple R-squared: 0.5577, Adjusted R-squared: 0.5571
## F-statistic: 918.6 on 2 and 1457 DF, p-value: < 2.2e-16
plot(lm1)
# Confidence interval for the mean response at every observed point
# (Careful: `lm` here is still the fire-damage model from earlier, hence the 16
# rows of output below; the house model would be predict(lm1, interval = "confidence"))
predict(lm, interval = "confidence")
## fit lwr upr
## 1 26.77136 25.60404 27.93869
## 2 19.13292 17.46701 20.79883
## 3 32.50020 30.99537 34.00502
## 4 21.51993 20.10121 22.93865
## 5 25.33916 24.15600 26.52231
## 6 36.79682 34.78582 38.80783
## 7 13.88149 11.54052 16.22245
## 8 24.86175 23.66358 26.05992
## 9 22.95214 21.65008 24.25420
## 10 31.06799 29.69634 32.43964
## 11 20.56513 19.05425 22.07600
## 12 15.79110 13.70949 17.87271
## 13 41.09345 38.49164 43.69526
## 14 33.45500 31.84931 35.06070
## 15 28.68098 27.46587 29.89608
## 16 30.59059 29.25751 31.92366
#For new data:
Newdf = data.frame(GrLivArea = c(2000), BedroomAbvGr = c(3))
predict(lm1, Newdf, interval = "confidence")
## fit lwr upr
## 1 239784.6 235956.9 243612.2
# Prediction interval for a new observation
predict(lm1, Newdf, interval = "predict")
## fit lwr upr
## 1 239784.6 136006.6 343562.5
#cooks.distance(lm1)
Bath = FullBath + 0.5*HalfBath
y = as.integer(format(Sys.Date(), format="%Y"))
Age = y-YearBuilt
house2=cbind(house, Bath, Age)
attach(house2)
## The following objects are masked _by_ .GlobalEnv:
##
## Age, Bath
## The following objects are masked from house:
##
## Alley, BedroomAbvGr, BldgType, BsmtCond, BsmtExposure,
## BsmtFinSF1, BsmtFinSF2, BsmtFinType1, BsmtFinType2,
## BsmtFullBath, BsmtHalfBath, BsmtQual, BsmtUnfSF, CentralAir,
## Condition1, Condition2, Electrical, EnclosedPorch, ExterCond,
## Exterior1st, Exterior2nd, ExterQual, Fence, FireplaceQu,
## Fireplaces, Foundation, FullBath, Functional, GarageArea,
## GarageCars, GarageCond, GarageFinish, GarageQual, GarageType,
## GarageYrBlt, GrLivArea, HalfBath, Heating, HeatingQC,
## HouseStyle, Id, KitchenAbvGr, KitchenQual, LandContour,
## LandSlope, LotArea, LotConfig, LotFrontage, LotShape,
## LowQualFinSF, MasVnrArea, MasVnrType, MiscFeature, MiscVal,
## MoSold, Neighborhood, OpenPorchSF, OverallCond, OverallQual,
## PavedDrive, PoolArea, PoolQC, RoofMatl, RoofStyle,
## SaleCondition, SalePrice, SaleType, ScreenPorch, TotalBsmtSF,
## TotRmsAbvGrd, Utilities, WoodDeckSF, X1stFlrSF, X2ndFlrSF,
## X3SsnPorch, YearBuilt, YearRemodAdd, YrSold
## The following object is masked from package:BSDA:
##
## Heating
lm2=lm(SalePrice ~ BedroomAbvGr + Fireplaces +GrLivArea + Bath + LotArea + MoSold + TotalBsmtSF + Age + GarageCars + OverallCond + OverallQual, house2)
summary(lm2)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + Fireplaces + GrLivArea +
## Bath + LotArea + MoSold + TotalBsmtSF + Age + GarageCars +
## OverallCond + OverallQual, data = house2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -515515 -18240 -1494 14310 273537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.441e+04 8.950e+03 -7.197 9.83e-13 ***
## BedroomAbvGr -7.620e+03 1.499e+03 -5.085 4.16e-07 ***
## Fireplaces 6.381e+03 1.761e+03 3.623 0.000302 ***
## GrLivArea 5.899e+01 3.772e+00 15.637 < 2e-16 ***
## Bath -2.599e+03 2.734e+03 -0.951 0.341892
## LotArea 5.542e-01 1.040e-01 5.327 1.15e-07 ***
## MoSold -5.318e+01 3.572e+02 -0.149 0.881682
## TotalBsmtSF 2.431e+01 2.959e+00 8.217 4.57e-16 ***
## Age -4.717e+02 5.189e+01 -9.092 < 2e-16 ***
## GarageCars 1.327e+04 1.756e+03 7.555 7.40e-14 ***
## OverallCond 6.713e+03 9.572e+02 7.013 3.58e-12 ***
## OverallQual 1.785e+04 1.177e+03 15.171 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36720 on 1448 degrees of freedom
## Multiple R-squared: 0.7879, Adjusted R-squared: 0.7863
## F-statistic: 489.1 on 11 and 1448 DF, p-value: < 2.2e-16
plot(lm2)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:corrgram':
##
## auto
## The following object is masked from 'package:datasets':
##
## rivers
ols_step_backward_aic(lm2)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . BedroomAbvGr
## 2 . Fireplaces
## 3 . GrLivArea
## 4 . Bath
## 5 . LotArea
## 6 . MoSold
## 7 . TotalBsmtSF
## 8 . Age
## 9 . GarageCars
## 10 . OverallCond
## 11 . OverallQual
##
##
## Variables Removed:
##
## - MoSold
## - Bath
##
## No more variables to be removed.
##
##
## Backward Elimination Summary
## -------------------------------------------------------------------------------
## Variable AIC RSS Sum Sq R-Sq Adj. R-Sq
## -------------------------------------------------------------------------------
## Full Model 34849.793 1.952688e+12 7.255223e+12 0.78793 0.78632
## MoSold 34847.816 1.952718e+12 7.255193e+12 0.78793 0.78647
## Bath 34846.727 1.953937e+12 7.253975e+12 0.78780 0.78648
## -------------------------------------------------------------------------------
ols_step_forward_aic(lm2)
## Forward Selection Method
## ------------------------
##
## Candidate Terms:
##
## 1 . BedroomAbvGr
## 2 . Fireplaces
## 3 . GrLivArea
## 4 . Bath
## 5 . LotArea
## 6 . MoSold
## 7 . TotalBsmtSF
## 8 . Age
## 9 . GarageCars
## 10 . OverallCond
## 11 . OverallQual
##
##
## Variables Entered:
##
## - OverallQual
## - GrLivArea
## - TotalBsmtSF
## - GarageCars
## - Age
## - OverallCond
## - LotArea
## - BedroomAbvGr
## - Fireplaces
##
## No more variables to be added.
##
## Selection Summary
## ---------------------------------------------------------------------------------
## Variable AIC Sum Sq RSS R-Sq Adj. R-Sq
## ---------------------------------------------------------------------------------
## OverallQual 35659.493 5.760947e+12 3.446964e+12 0.62565 0.62540
## GrLivArea 35267.584 6.576044e+12 2.631868e+12 0.71417 0.71378
## TotalBsmtSF 35119.663 6.832887e+12 2.375025e+12 0.74207 0.74154
## GarageCars 35012.383 7.004165e+12 2.203746e+12 0.76067 0.76001
## Age 34969.203 7.071316e+12 2.136595e+12 0.76796 0.76716
## OverallCond 34923.147 7.140498e+12 2.067413e+12 0.77547 0.77455
## LotArea 34889.198 7.190782e+12 2.01713e+12 0.78094 0.77988
## BedroomAbvGr 34858.234 7.235814e+12 1.972098e+12 0.78583 0.78464
## Fireplaces 34846.727 7.253975e+12 1.953937e+12 0.78780 0.78648
## ---------------------------------------------------------------------------------
#If VIF >10 we have issues with multicollinearity
library(car)
vif(lm2)
## BedroomAbvGr Fireplaces GrLivArea Bath LotArea
## 1.616956 1.395134 4.251037 3.271587 1.166563
## MoSold TotalBsmtSF Age GarageCars OverallCond
## 1.009299 1.823162 2.656941 1.863399 1.227618
## OverallQual
## 2.865238
I couldn’t get the SAS data to load, so I used the Regression Analysis (408) example instead.
Data=read.table("http://users.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%2014%20Data%20Sets/CH14PR13.txt")
names(Data) = c("Y", "X1", "X2") #X1: annual family income ($1000s), X2: age of oldest family car, Y = 1 if a car was purchased
n=nrow(Data)
car_logit <- glm(Y~X1+X2,family=binomial(link=logit),data=Data)
summary(car_logit)
##
## Call:
## glm(formula = Y ~ X1 + X2, family = binomial(link = logit), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6189 -0.8949 -0.5880 0.9653 2.0846
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.73931 2.10195 -2.255 0.0242 *
## X1 0.06773 0.02806 2.414 0.0158 *
## X2 0.59863 0.39007 1.535 0.1249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.987 on 32 degrees of freedom
## Residual deviance: 36.690 on 30 degrees of freedom
## AIC: 42.69
##
## Number of Fisher Scoring iterations: 4
expb1 = exp(coef(car_logit)[2])
expb2 = exp(coef(car_logit)[3])
c(expb1,expb2)
## X1 X2
## 1.070079 1.819627
The odds of purchasing a car increase by about 7% for each additional thousand dollars of family income, holding everything else constant.
The odds of purchasing a car increase by about 82% for each additional year of age of the oldest family automobile, holding everything else constant.
exp(predict(car_logit,new=data.frame(X1=50,X2=3)))/(1+exp(predict(car_logit,new=data.frame(X1=50,X2=3))))
## 1
## 0.6090245
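#Simpler: predict with type = "response" returns the fitted probability directly
predict(car_logit, newdata = data.frame(X1 = 50, X2 = 3), type = "response")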
#ROC curve analysis (e.g. with the pROC package) would be the natural next step, but it is not covered here
My class did not get to this :(
library(stats) #stats is attached by default, so this line is optional
#This way of inputting data (a text block read with read.table) would have been handy to know earlier!
Input = ("
County Sex Result Count
Bloom Female Pass 9
Bloom Female Fail 5
Bloom Male Pass 7
Bloom Male Fail 17
Cobblestone Female Pass 11
Cobblestone Female Fail 4
Cobblestone Male Pass 9
Cobblestone Male Fail 21
Dougal Female Pass 9
Dougal Female Fail 7
Dougal Male Pass 19
Dougal Male Fail 9
Heimlich Female Pass 15
Heimlich Female Fail 8
Heimlich Male Pass 14
Heimlich Male Fail 17
")
Data = read.table(textConnection(Input),header=TRUE)
### Order factors otherwise R will alphabetize them
Data$County = factor(Data$County, levels=unique(Data$County))
Data$Sex = factor(Data$Sex, levels=unique(Data$Sex))
Data$Result = factor(Data$Result, levels=unique(Data$Result))
### Convert data to a table
Table = xtabs(Count ~ Sex + Result + County,
data=Data)
### Note that the grouping variable is last in the xtabs function
ftable(Table) # Display a flattened table
## County Bloom Cobblestone Dougal Heimlich
## Sex Result
## Female Pass 9 11 9 15
## Fail 5 4 7 8
## Male Pass 7 9 19 14
## Fail 17 21 9 17
#Cochran–Mantel–Haenszel test
mantelhaen.test(Table)
##
## Mantel-Haenszel chi-squared test with continuity correction
##
## data: Table
## Mantel-Haenszel X-squared = 6.7314, df = 1, p-value = 0.009473
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.248399 4.260003
## sample estimates:
## common odds ratio
## 2.306119
#Woolf test
library(vcd)
## Loading required package: grid
##
## Attaching package: 'vcd'
## The following object is masked from 'package:BSDA':
##
## Trucks
woolf_test(Table)
##
## Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)
##
## data: Table
## X-squared = 7.1376, df = 3, p-value = 0.06764
My class did not get to this either :(
data("BOD")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
## Reason: image not found
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/library/tcltk/libs//tcltk.so'' had status 1
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
sqldf('SELECT demand FROM BOD')
## demand
## 1 8.3
## 2 10.3
## 3 19.0
## 4 16.0
## 5 15.6
## 6 19.8
SQL query skeleton: SELECT column(s) FROM table-name | view-name [WHERE expression] [GROUP BY column(s)] [HAVING expression] [ORDER BY column(s)]
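#A quick example combining these clauses (BOD has columns Time and demand):
sqldf('SELECT Time, demand FROM BOD WHERE demand > 15 ORDER BY demand')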