climate=read.csv("C:\\Users\\Administrator\\Documents\\big data\\Unit2\\Unit2\\climate_change.csv")
training_set1 = climate[climate$Year<=2006,]  # training set: years up to and including 2006
training_set2 = climate[climate$Year>2006,]   # testing set: years after 2006
【1.1】 Enter the model R2
summary(lm(Temp~MEI+CO2+CH4+N2O+CFC.11+CFC.12+TSI+Aerosols,training_set1))$r.square
## [1] 0.7508933
【1.2】 Which variables are significant in the model? We will consider a variable significant only if the p-value is below 0.05.
names(which(summary(lm(Temp~MEI+CO2+CH4+N2O+CFC.11+CFC.12+TSI+Aerosols,training_set1))$coefficients[-1,4]<0.05))
## [1] "MEI" "CO2" "CFC.11" "CFC.12" "TSI" "Aerosols"
【2.1】 In the model above, the coefficients of N2O and CFC-11 are negative even though both are greenhouse gases. Which of the following is the simplest correct explanation for this contradiction?
cor(training_set1[3:10])
## MEI CO2 CH4 N2O CFC.11
## MEI 1.000000000 -0.04114717 -0.0334193 -0.05081978 0.06900044
## CO2 -0.041147165 1.00000000 0.8772796 0.97671982 0.51405975
## CH4 -0.033419301 0.87727963 1.0000000 0.89983864 0.77990402
## N2O -0.050819775 0.97671982 0.8998386 1.00000000 0.52247732
## CFC.11 0.069000439 0.51405975 0.7799040 0.52247732 1.00000000
## CFC.12 0.008285544 0.85268963 0.9636162 0.86793078 0.86898518
## TSI -0.154491923 0.17742893 0.2455284 0.19975668 0.27204596
## Aerosols 0.340237787 -0.35615480 -0.2678092 -0.33705457 -0.04392120
## CFC.12 TSI Aerosols
## MEI 0.008285544 -0.15449192 0.34023779
## CO2 0.852689627 0.17742893 -0.35615480
## CH4 0.963616248 0.24552844 -0.26780919
## N2O 0.867930776 0.19975668 -0.33705457
## CFC.11 0.868985183 0.27204596 -0.04392120
## CFC.12 1.000000000 0.25530281 -0.22513124
## TSI 0.255302814 1.00000000 0.05211651
## Aerosols -0.225131244 0.05211651 1.00000000
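# Simplest explanation: all of the gas concentration variables (CO2, CH4, N2O, CFC-11, CFC-12) reflect human development and are highly correlated with one another (see the matrix above), so the negative N2O and CFC-11 coefficients are an artifact of multicollinearity rather than evidence of a cooling effect.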
【2.2.1】 Compute the correlations between all the variables in the training set. Which of the following independent variables is N2O highly correlated with (absolute correlation greater than 0.7)?
cors = cor(training_set1[3:10])
names(which(abs(cors[, "N2O"]) >= 0.7 & abs(cors[, "N2O"]) < 1))
## [1] "CO2" "CH4" "CFC.12"
【2.2.2】 Which of the following independent variables is CFC.11 highly correlated with?
names(which(abs(cors[, "CFC.11"]) >= 0.7 & abs(cors[, "CFC.11"]) < 1))
## [1] "CH4" "CFC.12"
【3.1.1】 Enter the coefficient of N2O in this reduced model:
lm(Temp~MEI+TSI+Aerosols+N2O,training_set1)$coefficients[5]
## N2O
## 0.02531975
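# Note: with the highly correlated gas variables removed, the N2O coefficient is positive, as expected for a greenhouse gas; the sign contradiction seen in 2.1 came from multicollinearity.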
【3.1.2】 Enter the model R2:
summary(lm(Temp~MEI+TSI+Aerosols+N2O,training_set1))$r.square
## [1] 0.7261321
【4.1.1】 Enter the R2 value of the model produced by the step function:
m1 = lm(Temp~MEI+CO2+CH4+N2O+CFC.11+CFC.12+TSI+Aerosols,training_set1)
stepmodel=step(m1)
## Start: AIC=-1348.16
## Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## - CH4 1 0.00049 2.3135 -1350.1
## <none> 2.3130 -1348.2
## - N2O 1 0.03132 2.3443 -1346.3
## - CO2 1 0.06719 2.3802 -1342.0
## - CFC.12 1 0.11874 2.4318 -1335.9
## - CFC.11 1 0.13986 2.4529 -1333.5
## - TSI 1 0.33516 2.6482 -1311.7
## - Aerosols 1 0.43727 2.7503 -1301.0
## - MEI 1 0.82823 3.1412 -1263.2
##
## Step: AIC=-1350.1
## Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## <none> 2.3135 -1350.1
## - N2O 1 0.03133 2.3448 -1348.3
## - CO2 1 0.06672 2.3802 -1344.0
## - CFC.12 1 0.13023 2.4437 -1336.5
## - CFC.11 1 0.13938 2.4529 -1335.5
## - TSI 1 0.33500 2.6485 -1313.7
## - Aerosols 1 0.43987 2.7534 -1302.7
## - MEI 1 0.83118 3.1447 -1264.9
summary(stepmodel)$r.square
## [1] 0.7508409
【4.1.2】 Which of the following variable(s) were eliminated from the full model by the step function?
setdiff(names(m1$coefficients),names(stepmodel$coefficients))
## [1] "CH4"
【5.1】 Testing on unseen data. Enter the testing-set R2:
pred=predict(stepmodel,training_set2)
SSE = sum((pred - training_set2$Temp)^2)
SST = sum((mean(training_set1$Temp) - training_set2$Temp)^2)
R2 = 1 - SSE/SST
R2
## [1] 0.6286051
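Since this out-of-sample R2 calculation recurs below, a small helper (hypothetical name osr2, not part of the original assignment) makes the convention explicit: the baseline prediction is always the training-set mean of the outcome.
osr2 = function(pred, actual, train_mean) {
  sse = sum((pred - actual)^2)   # sum of squared prediction errors
  sst = sum((train_mean - actual)^2)   # baseline errors, using the training-set mean
  1 - sse / sst
}
osr2(pred, training_set2$Temp, mean(training_set1$Temp))  # reproduces the value above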
test=read.csv("C:\\Users\\Administrator\\Documents\\big data\\Unit2\\Unit2\\pisa2009test.csv")
train=read.csv("C:\\Users\\Administrator\\Documents\\big data\\Unit2\\Unit2\\pisa2009train.csv")
【1.1】 Dataset size. How many students are there in the training set?
nrow(train)
## [1] 3663
【1.2.1】 Using tapply() on pisaTrain, what is the average reading test score of males?
tapply(train$readingScore,train$male,mean)[2]
## 1
## 483.5325
【1.2.2】 Of females?
tapply(train$readingScore,train$male,mean)[1]
## 0
## 512.9406
【1.3】 Which variables are missing data in at least one observation in the training set?
names(which(colSums(is.na(train)) > 0))
## [1] "raceeth" "preschool"
## [3] "expectBachelors" "motherHS"
## [5] "motherBachelors" "motherWork"
## [7] "fatherHS" "fatherBachelors"
## [9] "fatherWork" "selfBornUS"
## [11] "motherBornUS" "fatherBornUS"
## [13] "englishAtHome" "computerForSchoolwork"
## [15] "read30MinsADay" "minutesPerWeekEnglish"
## [17] "studentsInEnglish" "schoolHasLibrary"
## [19] "schoolSize"
【1.4.1】 How many observations are now in the training set?
train=na.omit(train)
nrow(train)
## [1] 2414
【1.4.2】 How many observations are now in the testing set?
test=na.omit(test)
nrow(test)
## [1] 990
【2.1.1】 Which of the following variables is an unordered factor with at least 3 levels?
table(train$grade) #ordered, 5-level
##
## 8 9 10 11 12
## 2 188 1730 491 3
table(train$male) #unordered, 2-level
##
## 0 1
## 1204 1210
table(train$raceeth) #unordered, 7-level***
##
## American Indian/Alaska Native
## 20
## Asian
## 95
## Black
## 228
## Hispanic
## 500
## More than one race
## 81
## Native Hawaiian/Other Pacific Islander
## 20
## White
## 1470
【2.1.2】 Which of the following variables is an ordered factor with at least 3 levels?
table(train$grade) #ordered, 5-level***
##
## 8 9 10 11 12
## 2 188 1730 491 3
# (male and raceeth tables are the same as shown above: male is an unordered 2-level factor, raceeth an unordered 7-level factor)
【2.2】 Which binary variables will be included in the regression model?
names(table(train$raceeth))  # every level except the reference level gets its own binary variable
## [1] "American Indian/Alaska Native"
## [2] "Asian"
## [3] "Black"
## [4] "Hispanic"
## [5] "More than one race"
## [6] "Native Hawaiian/Other Pacific Islander"
## [7] "White"
【2.3.1】 For a student who is Asian, which binary variables would be set to 0?
names(table(train$raceeth))[-c(2,7)]
## [1] "American Indian/Alaska Native"
## [2] "Black"
## [3] "Hispanic"
## [4] "More than one race"
## [5] "Native Hawaiian/Other Pacific Islander"
【2.3.2】 For a student who is White, which binary variables would be set to 0?
names(table(train$raceeth))[-7]
## [1] "American Indian/Alaska Native"
## [2] "Asian"
## [3] "Black"
## [4] "Hispanic"
## [5] "More than one race"
## [6] "Native Hawaiian/Other Pacific Islander"
【3.1】 What is the Multiple R-squared value of lmScore on the training set?
train$raceeth = relevel(train$raceeth, "White")
test$raceeth = relevel(test$raceeth, "White")
lmScore = lm(readingScore~., train)
summary(lmScore)[8]
## $r.squared
## [1] 0.3251434
【3.2】 What is the training-set root-mean squared error (RMSE) of lmScore?
SSE = sum(lmScore$residuals^2)
RMSE = sqrt(SSE / nrow(train))  # equivalently sqrt(mean(lmScore$residuals^2))
RMSE
## [1] 73.36555
【3.3】 What is the predicted reading score of student A minus the predicted reading score of student B?
lmScore$coefficients["grade"]*2  # the two students differ by two grade levels and are otherwise identical
## grade
## 59.08541
【3.4】 What is the meaning of the coefficient associated with variable raceethAsian?
# Predicted difference in the reading score between an Asian student and a white student who is otherwise identical
【3.5】 Based on the significance codes, which variables are candidates for removal from the model?
names(which(summary(lmScore)$coefficients[-1,4] > 0.05))[-c(1,2)]  # p-value above 0.05; the first two entries (raceeth dummy levels) are dropped because raceeth is treated as a single factor
## [1] "preschool" "motherHS"
## [3] "motherWork" "fatherHS"
## [5] "fatherWork" "selfBornUS"
## [7] "motherBornUS" "fatherBornUS"
## [9] "englishAtHome" "minutesPerWeekEnglish"
## [11] "studentsInEnglish" "schoolHasLibrary"
## [13] "urban"
【4.1】 What is the range between the maximum and minimum predicted reading score on the test set?
predTest = predict(lmScore, newdata=test)
max(predTest) - min(predTest)
## [1] 284.4683
【4.2.1】 What is the sum of squared errors (SSE) of lmScore on the testing set?
sum((predTest-test$readingScore)^2)
## [1] 5762082
【4.2.2】 What is the root-mean squared error (RMSE) of lmScore on the testing set?
sqrt(mean((predTest-test$readingScore)^2))
## [1] 76.29079
【4.3.1】 What is the predicted test score used in the baseline model?
baseline = mean(train$readingScore);baseline
## [1] 517.9629
【4.3.2】 What is the sum of squared errors of the baseline model on the testing set?
sum((baseline-test$readingScore)^2)
## [1] 7802354
【4.4】 Test-set R-squared. What is the test-set R-squared value of lmScore?
1-(sum((predTest-test$readingScore)^2)/sum((baseline-test$readingScore)^2))
## [1] 0.2614944
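The same value falls out of the osr2 helper sketched in the climate section above, again using the training-set mean as the baseline:
osr2(predTest, test$readingScore, baseline)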
flutrain=read.csv("C:\\Users\\Administrator\\Documents\\big data\\Unit2\\Unit2\\FluTrain.csv")
flutest=read.csv("C:\\Users\\Administrator\\Documents\\big data\\Unit2\\Unit2\\FluTest.csv")
【1.1.1】 Looking at the time period 2004-2011, which week corresponds to the highest percentage of ILI-related physician visits?
as.character(flutrain[which.max(flutrain$ILI),][,1])
## [1] "2009-10-18 - 2009-10-24"
【1.1.2】 Which week corresponds to the highest ILI-related search query fraction?
as.character(flutrain[which.max(flutrain$Queries),][,1])
## [1] "2009-10-18 - 2009-10-24"
【1.2】 What best describes the distribution of values of ILI?
hist(flutrain$ILI,20,xlab="percentage",main="distribution")
# right-skewed: most weeks have small ILI values, with a long right tail of much larger values
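Because ILI is right-skewed, its natural logarithm is far more symmetric, which motivates modeling log(ILI) in the questions that follow (a quick sketch):
hist(log(flutrain$ILI),20,xlab="log(percentage)",main="distribution of log(ILI)")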
【1.3】 Plot the natural logarithm of ILI versus Queries. What does the plot suggest?
plot(flutrain$Queries,log(flutrain$ILI),xlab="Queries",ylab="log(ILI)",main="scatter",col="blue")  # positive, approximately linear relationship
【2.1】 Based on our understanding of the data from the previous subproblem, which model best describes our estimation problem?
# From the plot above, the appropriate form is linear in log(ILI):
# log(ILI) = b0 + b1*Queries
【2.2】 What is the training set R-squared value for FluTrend1 model (the “Multiple R-squared”)?
FluTrend1=lm(log(ILI)~Queries,flutrain)
summary(FluTrend1)$r.square
## [1] 0.7090201
【2.3】 For a one-variable linear regression, what is the relationship between the model's R-squared and the correlation between the dependent and independent variables?
R_squared=cor(log(flutrain$ILI),flutrain$Queries)^2;R_squared
## [1] 0.7090201
#R-squared = Correlation^2
【3.1】 What is our estimate for the percentage of ILI-related physician visits for the week of March 11, 2012?
PredTest1 = exp(predict(FluTrend1, flutest))  # the model predicts log(ILI), so exponentiate to recover ILI
as.numeric(PredTest1[which(flutest$Week == "2012-03-11 - 2012-03-17")])
## [1] 2.187378
【3.2】 What is the relative error between the estimate (our prediction) and the observed value for the week of March 11, 2012?
as.numeric((flutest$ILI[11]-PredTest1[11])/flutest$ILI[11])  # row 11 of FluTest is the week of March 11, 2012; relative error = (observed - estimate)/observed
## [1] 0.04623827
【3.3】 What is the Root Mean Square Error (RMSE) between our estimates and the actual observations for the percentage of ILI-related physician visits, on the test set?
SSE = sum((PredTest1-flutest$ILI)^2)
RMSE = sqrt(SSE / nrow(flutest))
RMSE
## [1] 0.7490645
Load the "zoo" package:
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
【4.1】 How many values are missing in the new ILILag2 variable?
flutrain$ILILag2 = lag(zoo(flutrain$ILI), -2, na.pad=TRUE)  # ILI value from two weeks earlier
sum(is.na(flutrain$ILILag2))
## [1] 2
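A quick sanity check (sketch): ILILag2 in week i should hold the ILI value from week i-2, so the first two rows are NA.
head(data.frame(ILI = flutrain$ILI, ILILag2 = coredata(flutrain$ILILag2)), 4)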
【4.2】 Use the plot() function to plot the log of ILILag2 against the log of ILI. Which best describes the relationship between these two variables?
plot(log(flutrain$ILILag2),log(flutrain$ILI),xlab="log(ILILag2)",ylab="log(ILI)",main="scatter",col="blue")  # strong positive, approximately linear relationship
【4.3.1】 Which coefficients are significant at the p=0.05 level in this regression model?
names(which(summary(lm(log(ILI)~Queries + log(ILILag2),flutrain))$coefficients[,4]<0.05))
## [1] "(Intercept)" "Queries" "log(ILILag2)"
【4.3.2】 What is the R^2 value of the FluTrend2 model?
as.numeric(summary(lm(log(ILI)~Queries + log(ILILag2),flutrain))$r.square)
## [1] 0.9063414
【4.4】 On the basis of R-squared value and significance of coefficients, which statement is the most accurate?
# FluTrend2 is the stronger model on the training set: its R-squared (0.906) is much higher than FluTrend1's (0.709), and all of its coefficients, including log(ILILag2), remain significant.
【5.1】 How many missing values are there in this new variable?
flutest$ILILag2 = lag(zoo(flutest$ILI), -2, na.pad=TRUE)
sum(is.na(flutest$ILILag2))
## [1] 2
【5.2.1】 Which value should be used to fill in the ILILag2 variable for the first observation in FluTest?
#The ILI value of the second-to-last observation in the FluTrain data frame.
【5.2.2】 Which value should be used to fill in the ILILag2 variable for the second observation in FluTest?
#The ILI value of the last observation in the FluTrain data frame
【5.3.1】 What is the new value of the ILILag2 variable in the first row of FluTest?
flutest$ILILag2[1:2] = flutrain$ILI[(nrow(flutrain)-1):nrow(flutrain)]  # second-to-last and last ILI values from the training data
as.numeric(flutest$ILILag2[1])
## [1] 1.852736
【5.3.2】 What is the new value of the ILILag2 variable in the second row of FluTest?
as.numeric(flutest$ILILag2[2])
## [1] 2.12413
【5.4】 What is the test-set RMSE of the FluTrend2 model?
FluTrend2=lm(log(ILI)~Queries + log(ILILag2),flutrain)
PredTest2 = exp(predict(FluTrend2, flutest))
SSE = sum((PredTest2-flutest$ILI)^2)
RMSE = sqrt(SSE / nrow(flutest))
RMSE
## [1] 0.2942029
【5.5】 Which model obtained the best test-set RMSE?
#FluTrend2
#The test-set RMSE of FluTrend2 is 0.294, as opposed to the 0.749 value obtained by the FluTrend1 model.