Created on 23 June 2013
Revised on Mon Jun 24 00:00:37 2013
Example: Movie ratings
require(RCurl)
## Loading required package: RCurl
## Loading required package: bitops
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/movies03RT.txt",
ssl.verifypeer = FALSE)
movies <- read.table(textConnection(myCsv), sep = "\t", header = T, quote = "")
head(movies)
## X score rating genre box.office running.time
## 1 2 Fast 2 Furious 48.9 PG-13 action/adventure 127.15 107
## 2 28 Days Later 78.2 R horror 45.06 113
## 3 A Guy Thing 39.5 PG-13 rom comedy 15.54 101
## 4 A Man Apart 42.9 R action/adventure 26.25 110
## 5 A Mighty Wind 79.9 PG-13 comedy 17.78 91
## 6 Agent Cody Banks 57.9 PG action/adventure 47.81 102
Rotton tomatoes score vs. rating
plot(movies$score ~ jitter(as.numeric(movies$rating)), col = "blue", xaxt = "n",
pch = 19)
axis(side = 1, at = unique(as.numeric(movies$rating)), labels = unique(movies$rating))
Average score by rating
plot(movies$score ~ jitter(as.numeric(movies$rating)), col = "blue", xaxt = "n",
pch = 19)
axis(side = 1, at = unique(as.numeric(movies$rating)), labels = unique(movies$rating))
meanRatings <- tapply(movies$score, movies$rating, mean)
points(1:4, meanRatings, col = "red", pch = "-", cex = 5)
Another way to write it down
Si = b0 + b1(Rai =“ PG ”) + b2(Rai =“ PG − 13 ”) + b3(Rai =“ R ”) + ei
Average values
b0 = average of the G movies
b0 + b1 = average of the PG movies
b0 + b2 = average of the PG-13 movies
b0 + b3 = average of the R movies
Here is how you do it in R
lm1 <- lm(movies$score ~ as.factor(movies$rating))
summary(lm1)
##
## Call:
## lm(formula = movies$score ~ as.factor(movies$rating))
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.43 -9.98 -0.98 9.34 38.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.65 7.19 9.40 <2e-16 ***
## as.factor(movies$rating)PG -12.59 7.85 -1.60 0.11
## as.factor(movies$rating)PG-13 -11.81 7.41 -1.59 0.11
## as.factor(movies$rating)R -12.02 7.48 -1.61 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 136 degrees of freedom
## Multiple R-squared: 0.0199, Adjusted R-squared: -0.00177
## F-statistic: 0.918 on 3 and 136 DF, p-value: 0.434
Plot fitted values
plot(movies$score ~ jitter(as.numeric(movies$rating)), col = "blue", xaxt = "n",
pch = 19)
axis(side = 1, at = unique(as.numeric(movies$rating)), labels = unique(movies$rating))
points(1:4, lm1$coeff[1] + c(0, lm1$coeff[2:4]), col = "red", pch = "-", cex = 5)
What is the average difference in rating between G and R movies?
b0 + b3 − b0 = b3
Question 1 in R
lm1 <- lm(movies$score ~ as.factor(movies$rating))
summary(lm1)
##
## Call:
## lm(formula = movies$score ~ as.factor(movies$rating))
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.43 -9.98 -0.98 9.34 38.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.65 7.19 9.40 <2e-16 ***
## as.factor(movies$rating)PG -12.59 7.85 -1.60 0.11
## as.factor(movies$rating)PG-13 -11.81 7.41 -1.59 0.11
## as.factor(movies$rating)R -12.02 7.48 -1.61 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 136 degrees of freedom
## Multiple R-squared: 0.0199, Adjusted R-squared: -0.00177
## F-statistic: 0.918 on 3 and 136 DF, p-value: 0.434
confint(lm1)
## 2.5 % 97.5 %
## (Intercept) 53.42 81.875
## as.factor(movies$rating)PG -28.11 2.928
## as.factor(movies$rating)PG-13 -26.47 2.842
## as.factor(movies$rating)R -26.80 2.763
What is the average difference in rating between PG -13 and R movies?
b0 + b2 − (b0 + b3) = b2 − b3
We could rewrite our model
Si = b0 + b1(Rai =“ G ”) + b2(Rai =“ PG ”) + b3(Rai =“ PG − 13 ”) + ei
Average values
b0 = average of the R movies
b0 + b1 = average of the G movies
b0 + b2 = average of the PG movies
b0 + b3 = average of the PG-13 movies
What is the average difference in rating between PG -13 and R movies?
b0 + b3 − b0 = b3
Question 2 in R
lm2 <- lm(movies$score ~ relevel(movies$rating, ref = "R"))
summary(lm2)
##
## Call:
## lm(formula = movies$score ~ relevel(movies$rating, ref = "R"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.43 -9.98 -0.98 9.34 38.97
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 55.630 2.035 27.34
## relevel(movies$rating, ref = "R")G 12.020 7.476 1.61
## relevel(movies$rating, ref = "R")PG -0.573 3.741 -0.15
## relevel(movies$rating, ref = "R")PG-13 0.205 2.706 0.08
## Pr(>|t|)
## (Intercept) <2e-16 ***
## relevel(movies$rating, ref = "R")G 0.11
## relevel(movies$rating, ref = "R")PG 0.88
## relevel(movies$rating, ref = "R")PG-13 0.94
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 136 degrees of freedom
## Multiple R-squared: 0.0199, Adjusted R-squared: -0.00177
## F-statistic: 0.918 on 3 and 136 DF, p-value: 0.434
confint(lm2)
## 2.5 % 97.5 %
## (Intercept) 51.606 59.654
## relevel(movies$rating, ref = "R")G -2.763 26.803
## relevel(movies$rating, ref = "R")PG -7.971 6.825
## relevel(movies$rating, ref = "R")PG-13 -5.146 5.557
Si = b0 + b1(Rai =“ PG ”) + b2(Rai =“ PG − 13 ”) + b3(Rai =“ R ”) + ei
Is there any difference in score between any of the movie ratings?
Question 3 in R
lm1 <- lm(movies$score ~ as.factor(movies$rating))
anova(lm1)
## Analysis of Variance Table
##
## Response: movies$score
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(movies$rating) 3 570 190 0.92 0.43
## Residuals 136 28149 207
Sum of squares (G movies)
gMovies <- movies[movies$rating == "G", ]
xVals <- seq(0.2, 0.8, length = 4)
plot(xVals, gMovies$score, ylab = "Score", xaxt = "n", xlim = c(0, 1), pch = 19)
abline(h = mean(gMovies$score), col = "blue", lwd = 3)
abline(h = mean(movies$score), col = "red", lwd = 3)
segments(xVals + 0.01, rep(mean(gMovies$score), length(xVals)), xVals + 0.01,
rep(mean(movies$score), length(xVals)), col = "red", lwd = 2)
segments(xVals - 0.01, gMovies$score, xVals - 0.01, rep(mean(gMovies$score),
length(xVals)), col = "blue", lwd = 2)
lm1 <- aov(movies$score ~ as.factor(movies$rating))
TukeyHSD(lm1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = movies$score ~ as.factor(movies$rating))
##
## $`as.factor(movies$rating)`
## diff lwr upr p adj
## PG-G -12.5929 -33.008 7.822 0.3795
## PG-13-G -11.8146 -31.092 7.463 0.3854
## R-G -12.0200 -31.464 7.424 0.3776
## PG-13-PG 0.7782 -8.615 10.171 0.9964
## R-PG 0.5729 -9.158 10.304 0.9987
## R-PG-13 -0.2054 -7.245 6.834 0.9998
Key ideas
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/WHOSIS_000008.csv",
ssl.verifypeer = FALSE)
hunger <- read.csv(textConnection(myCsv))
head(hunger)
## Indicator Data.Source PUBLISH.STATES Year
## 1 Children aged <5 years underweight (%) NLIS_311900 Published 1997
## 2 Children aged <5 years underweight (%) NLIS_312819 Published 2004
## 3 Children aged <5 years underweight (%) NLIS_312819 Published 2004
## 4 Children aged <5 years underweight (%) NLIS_312819 Published 2004
## 5 Children aged <5 years underweight (%) NLIS_311954 Published 1998
## 6 Children aged <5 years underweight (%) NLIS_312361 Published 2000
## WHO.region Country Sex Display.Value Numeric Low
## 1 Eastern Mediterranean Afghanistan Both sexes 44.9 44.9 NA
## 2 Eastern Mediterranean Afghanistan Female 33.0 33.0 NA
## 3 Eastern Mediterranean Afghanistan Male 32.7 32.7 NA
## 4 Eastern Mediterranean Afghanistan Both sexes 32.9 32.9 NA
## 5 Europe Albania Both sexes 7.1 7.1 NA
## 6 Europe Albania Male 19.6 19.6 NA
## High Comments
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
Plot percent hungry versus time
lm1 <- lm(hunger$Numeric ~ hunger$Year)
plot(hunger$Year, hunger$Numeric, pch = 19, col = "blue")
Hui = b0 + b1 * Yi + ei
b0 = percent hungry at Year 0
b1 = decrease in percent hungry per year
ei = everything we didn't measure
Add the linear model
lm1 <- lm(hunger$Numeric ~ hunger$Year)
plot(hunger$Year, hunger$Numeric, pch = 19, col = "blue")
lines(hunger$Year, lm1$fitted, lwd = 3, col = "darkgrey")
Color by male/female
plot(hunger$Year, hunger$Numeric, pch = 19)
points(hunger$Year, hunger$Numeric, pch = 19, col = ((hunger$Sex == "Male") *
1 + 1))
HuFi = bf0 + bf * YFi + efi
bf0 = percent of girls hungry at Year 0
bf1 = decrease in percent of girls hungry per year
efi = everything we didn't measure
HuMi = bm0 + bm * YMi + emi
bm0 = percent of boys hungry at Year 0
bm1 = decrease in percent of boys hungry per year
emi = everything we didn't measure
Color by male/female
lmM <- lm(hunger$Numeric[hunger$Sex == "Male"] ~ hunger$Year[hunger$Sex == "Male"])
lmF <- lm(hunger$Numeric[hunger$Sex == "Female"] ~ hunger$Year[hunger$Sex ==
"Female"])
plot(hunger$Year, hunger$Numeric, pch = 19)
points(hunger$Year, hunger$Numeric, pch = 19, col = ((hunger$Sex == "Male") *
1 + 1))
lines(hunger$Year[hunger$Sex == "Male"], lmM$fitted, col = "black", lwd = 3)
lines(hunger$Year[hunger$Sex == "Female"], lmF$fitted, col = "red", lwd = 3)
H = b0 + b1 + b2 * Yi (Sex =“ Male ”) + ei
b0 - percent hungry at year zero for females b0 + b1 - percent hungry at year zero for males b2 - change in percent hungry (for either males or females) in one year ei - everything we didn't measure
Two lines, same slope in R
lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex)
plot(hunger$Year, hunger$Numeric, pch = 19)
points(hunger$Year, hunger$Numeric, pch = 19, col = ((hunger$Sex == "Male") *
1 + 1))
abline(c(lmBoth$coeff[1], lmBoth$coeff[2]), col = "red", lwd = 3)
abline(c(lmBoth$coeff[1] + lmBoth$coeff[3], lmBoth$coeff[2]), col = "black",
lwd = 3)
H = b0 + b1 (Sex =“ Male ”) + b2 * Yi + b3 (Sex =“ Male ”) + ei
b0 - percent hungry at year zero for females
b0 + b1 - percent hungry at year zero for males
b2 - change in percent hungry (females) in one year
b2 + b3 - change in percent hungry (males) in one year
ei - everything we didn't measure
Two lines, different slopes in R
lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex * hunger$Year)
plot(hunger$Year, hunger$Numeric, pch = 19)
points(hunger$Year, hunger$Numeric, pch = 19, col = ((hunger$Sex == "Male") *
1 + 1))
abline(c(lmBoth$coeff[1], lmBoth$coeff[2]), col = "red", lwd = 3)
abline(c(lmBoth$coeff[1] + lmBoth$coeff[3], lmBoth$coeff[2] + lmBoth$coeff[4]),
col = "black", lwd = 3)
summary(lmBoth)
##
## Call:
## lm(formula = hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex *
## hunger$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.09 -11.52 -2.48 7.44 47.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 365.5396 124.9619 2.93 0.0035 **
## hunger$Year -0.1743 0.0626 -2.79 0.0054 **
## hunger$SexFemale 156.9326 231.1954 0.68 0.4974
## hunger$SexMale 215.3056 231.1954 0.93 0.3519
## hunger$Year:hunger$SexFemale -0.0784 0.1156 -0.68 0.4976
## hunger$Year:hunger$SexMale -0.1067 0.1156 -0.92 0.3565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.7 on 1533 degrees of freedom
## Multiple R-squared: 0.0176, Adjusted R-squared: 0.0144
## F-statistic: 5.5 on 5 and 1533 DF, p-value: 4.97e-05
library(UsingR)
## Warning: package 'UsingR' was built under R version 3.0.1
## Loading required package: MASS
## Attaching package: 'UsingR'
## 下列对象被屏蔽了_by_ '.GlobalEnv':
##
## movies
data(galton)
plot(galton$parent, galton$child, col = "blue", pch = 19)
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/WHOSIS_000008.csv",
ssl.verifypeer = FALSE)
hunger <- read.csv(textConnection(myCsv))
Hunger over time by region
par(mfrow = c(1, 2))
plot(hunger$Year, hunger$Numeric, col = as.numeric(hunger$WHO.region), pch = 19)
plot(1:10, type = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")
legend(1, 10, col = unique(as.numeric(hunger$WHO.region)), legend = unique(hunger$WHO.region),
pch = 19)
Region correlated with year
anova(lm(hunger$Year ~ hunger$WHO.region))
## Analysis of Variance Table
##
## Response: hunger$Year
## Df Sum Sq Mean Sq F value Pr(>F)
## hunger$WHO.region 5 2558 512 8.8 3e-08 ***
## Residuals 1533 89129 58
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Region correlated with hunger
anova(lm(hunger$Numeric ~ hunger$WHO.region))
## Analysis of Variance Table
##
## Response: hunger$Numeric
## Df Sum Sq Mean Sq F value Pr(>F)
## hunger$WHO.region 5 141579 28316 285 <2e-16 ***
## Residuals 1533 152418 99
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Including region - a complicated interaction
plot(hunger$Year, hunger$Numeric, pch = 19, col = as.numeric(hunger$WHO.region))
lmRegion <- lm(hunger$Numeric ~ hunger$Year + hunger$WHO.region + hunger$Year *
hunger$WHO.region)
abline(c(lmRegion$coeff[1] + lmRegion$coeff[6], lmRegion$coeff[2] + lmRegion$coeff[12]),
col = 5, lwd = 3)
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/income.csv", ssl.verifypeer = FALSE)
incomeData <- read.csv(textConnection(myCsv), header = F)
income <- incomeData[, 3]
age <- incomeData[, 1]
Logs to address right-skew
oldpar=par(mfrow=c(1,4))
smoothScatter(age,income)
hist(income,col="blue",breaks=100)
hist(log(income+1),col="blue",breaks=100)
smoothScatter(age,log(income+1))
par(oldpar)
“outliers” … are data points that do not appear to follow the pattern of the other data points.
Example - extreme points
set.seed(1235)
xVals <- rcauchy(50)
hist(xVals, col = "blue")
Example - Outliers may be real
age <- c(age, 52)
income <- c(income, 3.78e+08)
smoothScatter(age, income) ## Add Tim Cook, CEO of Apple 2011 income
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
## Warning: Binning grid too coarse for current (small) bandwidth: consider
## increasing 'gridsize'
Example - Does not fit the trend
Outliers - what you can do?
If you know they aren't real/of interest, remove them (but changes question!)
Alternatively
Variance changes
require(RCurl)
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/bupa.data", ssl.verifypeer = FALSE)
bupaData <- read.csv(textConnection(myCsv), header = F)
ggt <- bupaData[, 5]
aat <- bupaData[, 3]
plot(log(ggt), aat, col = "blue", pch = 19)
Plot the residuals
lm1 <- lm(aat ~ log(ggt))
plot(log(ggt), lm1$residuals, col = "blue", pch = 19)