# 1. Point the session at the project folder so the relative file name used
# below resolves.
# NOTE(review): this absolute path is specific to one machine; confirm it
# before running the script anywhere else.
setwd(
  "C:/Users/sharl/Desktop/USF/Fall 2021/LIS 4805 - Predictive Analytics/Week 7 - Midterm Project"
)
# 2. Read College.csv from the working directory and keep it as `data`.
# The file is read with readr::read_csv(), so `data` is a tibble.
library(readr)
data <- read_csv(file = "College.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## X1 = col_character(),
## Private = col_character(),
## Apps = col_double(),
## Accept = col_double(),
## Enroll = col_double(),
## Top10perc = col_double(),
## Top25perc = col_double(),
## F.Undergrad = col_double(),
## P.Undergrad = col_double(),
## Outstate = col_double(),
## Room.Board = col_double(),
## Books = col_double(),
## Personal = col_double(),
## PhD = col_double(),
## Terminal = col_double(),
## S.F.Ratio = col_double(),
## perc.alumni = col_double(),
## Expend = col_double(),
## Grad.Rate = col_double()
## )
# 3. Show the first ten rows to explore the columns and their types.
head(x = data, n = 10)
## # A tibble: 10 x 19
## X1 Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abil~ Yes 1660 1232 721 23 52 2885 537
## 2 Adel~ Yes 2186 1924 512 16 29 2683 1227
## 3 Adri~ Yes 1428 1097 336 22 50 1036 99
## 4 Agne~ Yes 417 349 137 60 89 510 63
## 5 Alas~ Yes 193 146 55 16 44 249 869
## 6 Albe~ Yes 587 479 158 38 62 678 41
## 7 Albe~ Yes 353 340 103 17 45 416 230
## 8 Albi~ Yes 1899 1720 489 37 68 1594 32
## 9 Albr~ Yes 1038 839 227 30 63 973 306
## 10 Alde~ Yes 582 498 172 21 44 799 78
## # ... with 10 more variables: Outstate <dbl>, Room.Board <dbl>, Books <dbl>,
## # Personal <dbl>, PhD <dbl>, Terminal <dbl>, S.F.Ratio <dbl>,
## # perc.alumni <dbl>, Expend <dbl>, Grad.Rate <dbl>
# 4. Attach the packages used below: dplyr, ISLR, boot, caret.
# library() is used instead of require(): library() stops with an error when
# a package is not installed, whereas require() only returns FALSE and lets
# the script fail later at the first call into the missing package.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ISLR)
library(boot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
# 5. Open the help page for the College data set to read what each variable
# means.
help("College")
## starting httpd help server ...
## done
# 6. Keep only the rows whose Private value is not "Yes" and store them as
# "public".
# Public contains 212 observations
# 7. In the same subsetting step, drop the first two columns (X1 and
# Private): they are character values and are not needed for linear modeling.
public <- data[!(data$Private == "Yes"), -c(1, 2)]
# 8. Set the seed to 1 so the random partition below is reproducible.
set.seed(1)
# 9. Create the index object `set` by sampling 2/3 of the rows based on
# Grad.Rate. list = FALSE asks for the indices as a matrix rather than a
# list, which is why nrow() works on the result. FALSE is spelled out because
# F is an ordinary variable that can be reassigned.
set <- createDataPartition(public$Grad.Rate, p = 2 / 3, list = FALSE)
nrow(set)
## [1] 143
# 10. Rows listed in `set` form the training data; all remaining rows form
# the test data.
# NOTE(review): `train` shares its name with caret's train() function. Calls
# written as train(...) still resolve to the function, but a distinct name
# would be clearer.
train <- public[set, ]
test <- public[-set, ]
#####################################################################################################################################
# 11. Fit a linear regression on the training rows. Grad.Rate is the
# dependent variable and ALL other columns are independent variables; no
# variable selection is done at this stage.
model <- lm(formula = Grad.Rate ~ ., data = train)
# 12. Describe the resampling scheme: repeated cross-validation with five
# folds and five repeats (five folds because the sample is small).
ControlParameters <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5
)
# 13. Fit the same formula through caret's train() on the train dataset so
# the linear model is evaluated with the resampling scheme defined above.
modellm <- train(
  Grad.Rate ~ .,
  data = train,
  method = "lm",
  trControl = ControlParameters
)
# 14. Print the model on the console and read off the cross-validated RMSE.
modellm
## Linear Regression
##
## 143 samples
## 16 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 114, 115, 115, 114, 114, 115, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 12.27555 0.4078312 9.50994
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Save the RMSE value itself (not the whole fitted object) as "RMSE_train".
# modellm$results holds the resampling results printed above; only one tuning
# setting was evaluated (intercept held at TRUE), so RMSE is a single number
# that can be compared directly with RMSE_test and RMSE_full below.
RMSE_train <- modellm$results$RMSE
RMSE_train
## [1] 12.27555
### Print summary of the modellm
summary(modellm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.913 -6.305 -0.334 6.374 38.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.0300381 10.8502044 4.611 9.7e-06 ***
## Apps 0.0012788 0.0007342 1.742 0.084018 .
## Accept -0.0014398 0.0014797 -0.973 0.332395
## Enroll 0.0046894 0.0036018 1.302 0.195307
## Top10perc -0.0014783 0.1296047 -0.011 0.990918
## Top25perc 0.2008193 0.1017104 1.974 0.050521 .
## F.Undergrad -0.0004705 0.0006400 -0.735 0.463646
## P.Undergrad -0.0013013 0.0004749 -2.740 0.007030 **
## Outstate 0.0019362 0.0005637 3.435 0.000803 ***
## Room.Board 0.0003553 0.0014301 0.248 0.804188
## Books -0.0141352 0.0077622 -1.821 0.070975 .
## Personal -0.0043351 0.0015223 -2.848 0.005143 **
## PhD 0.1052924 0.1512471 0.696 0.487610
## Terminal -0.2461042 0.1454937 -1.692 0.093211 .
## S.F.Ratio 0.1973321 0.3662775 0.539 0.591010
## perc.alumni 0.3196940 0.1328844 2.406 0.017590 *
## Expend -0.0003110 0.0006452 -0.482 0.630680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.91 on 126 degrees of freedom
## Multiple R-squared: 0.5198, Adjusted R-squared: 0.4588
## F-statistic: 8.525 on 16 and 126 DF, p-value: 1.16e-13
#####################################################################################################################################
# 15. Predict Grad.Rate for the new data (the held-out test rows).
p <- predict(modellm, test)
p
## 1 2 3 4 5 6 7 8
## 48.18019 59.60198 53.55878 53.55144 46.81897 45.12605 50.87809 62.30690
## 9 10 11 12 13 14 15 16
## 53.37525 43.17603 56.55543 38.35379 60.73397 52.42691 52.67701 77.75176
## 17 18 19 20 21 22 23 24
## 51.36309 50.02504 47.89318 64.86872 63.88854 66.77046 66.61606 51.73441
## 25 26 27 28 29 30 31 32
## 53.16560 52.31266 51.12853 66.26005 42.08696 74.13990 35.97242 73.72418
## 33 34 35 36 37 38 39 40
## 37.90832 48.35598 52.21865 54.20413 71.83962 57.62960 50.56116 54.64695
## 41 42 43 44 45 46 47 48
## 53.71841 71.64255 56.71228 82.01802 54.44534 46.21338 54.16526 60.10383
## 49 50 51 52 53 54 55 56
## 50.74132 62.49695 53.57364 94.87368 50.83840 64.38201 81.08365 44.29337
## 57 58 59 60 61 62 63 64
## 54.82068 40.21413 52.38911 46.39986 54.71972 52.96421 58.07081 52.25302
## 65 66 67 68 69
## 44.13874 43.61924 48.28823 61.12951 55.38423
# 16. Compute the prediction errors of modellm on the test data
# (predicted minus observed Grad.Rate).
error <- p - test$Grad.Rate
# 17. Calculate RMSE from those errors: the square root of the mean squared
# error. Save this value in an object called "RMSE_test".
RMSE_test <- sqrt(mean(error^2))
RMSE_test
## [1] 12.26936
# 18. Compare RMSE_train and RMSE_test. The cross-validated training RMSE and
# the test RMSE are almost identical (12.28 vs 12.27).
RMSE_train
## [1] 12.27555
RMSE_test
## [1] 12.26936
##########################################################################################################
# 19. Predict on the full "public" dataset. Is the full data RMSE lower than
# the test set result? Yes (10.94 vs 12.27). Note that "public" contains the
# 143 rows the model was fitted on, so this figure is not an out-of-sample
# error.
p_full <- predict(modellm, public)
error_full <- p_full - public$Grad.Rate
# 20. Calculate RMSE from the full data and save the value as "RMSE_full".
RMSE_full <- sqrt(mean(error_full^2))
# 21. Compare the RMSEs from the train data (cross-validated), the test data,
# and the full data.
RMSE_train
## [1] 12.27555
RMSE_test
## [1] 12.26936
RMSE_full
## [1] 10.94097