You should complete the following problems in R. Be sure to provide the instructor with both your code and output.
Read the homes dataset into R. This dataset contains
information about houses sold in King County, Washington in 2014.
homes <- read.csv("/Users/ryannlaky/Documents/University of Indianapolis/MSDA 622/Homework/Homework 5/homes.csv")
# Show the column names, types and first few values of the freshly read data.
str(homes)
## 'data.frame': 76 obs. of 18 variables:
## $ Price : num 388 450 386 350 156 ...
## $ Floor : num 2.18 2.05 2.11 1.44 1.8 ...
## $ Lot : int 4 5 5 6 1 5 4 4 4 5 ...
## $ Bath : num 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ Bed : int 4 4 4 2 4 3 4 4 4 3 ...
## $ BathBed: num 12 12 8 2 8 6 4.4 8 8.4 6.3 ...
## $ Year : int 1940 1957 1955 1956 1994 1940 1958 1961 1965 1968 ...
## $ Age : num -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ AgeSq : num 9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
## $ Gar : int 0 2 2 1 1 1 1 2 2 2 ...
## $ Status : chr "Sold" "Sold" "Sold" "Active" ...
## $ DAc : int 0 0 0 1 0 0 1 0 1 0 ...
## $ School : chr "Edison" "Edison" "Edison" "Adams" ...
## $ DEd : int 1 1 1 0 0 0 0 0 0 0 ...
## $ DHa : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DAd : int 0 0 0 1 1 1 0 0 0 0 ...
## $ DCr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DPa : int 0 0 0 0 0 0 1 1 1 1 ...
# Keep only Price and the five numeric columns Floor, Bath, BathBed, Age and
# AgeSq. Columns are chosen by name instead of dropping positions 3, 5, 7 and
# 10:18, so the selection does not depend on the column order of the CSV and
# re-running this line on the already-reduced data frame is harmless.
homes <- homes[, c("Price", "Floor", "Bath", "BathBed", "Age", "AgeSq")]
str(homes)
## 'data.frame': 76 obs. of 6 variables:
## $ Price : num 388 450 386 350 156 ...
## $ Floor : num 2.18 2.05 2.11 1.44 1.8 ...
## $ Bath : num 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ BathBed: num 12 12 8 2 8 6 4.4 8 8.4 6.3 ...
## $ Age : num -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ AgeSq : num 9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
# Min-max scale every column of `homes` to [0, 1]: (x - min) / (max - min).
# vapply() visits the data frame column by column, so the data are not coerced
# to a matrix first (which apply() does), and a column whose max/min is not
# numeric stops with an error instead of silently producing character values.
maxs <- vapply(homes, max, numeric(1)) # maximum value of each column
mins <- vapply(homes, min, numeric(1)) # minimum value of each column
scaled <- as.data.frame(scale(homes, center = mins, scale = maxs - mins))
summary(scaled)
## Price Floor Bath BathBed
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2963 1st Qu.:0.2890 1st Qu.:0.4762 1st Qu.:0.3077
## Median :0.4092 Median :0.3616 Median :0.4762 Median :0.3308
## Mean :0.4424 Mean :0.3643 Mean :0.5752 Mean :0.4363
## 3rd Qu.:0.6154 3rd Qu.:0.4584 3rd Qu.:0.9524 3rd Qu.:0.5385
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Age AgeSq
## Min. :0.0000 Min. :0.000000
## 1st Qu.:0.5275 1st Qu.:0.005917
## Median :0.6450 Median :0.028876
## Mean :0.6441 Mean :0.128991
## 3rd Qu.:0.7500 3rd Qu.:0.213018
## Max. :1.0000 Max. :1.000000
# Random 85% / 15% split of the scaled data into training and test rows.
# A fixed seed (the value itself is arbitrary) makes the split repeatable from
# run to run; floor() makes the whole-number training size explicit instead of
# handing sample() a fractional size.
set.seed(123)
index <- sample(nrow(homes), floor(nrow(homes) * 0.85))
train_homes <- scaled[index, ]
test_homes <- scaled[-index, ]
# install.packages("neuralnet")
library(neuralnet)
# Fit a network that predicts the scaled Price from every other column of
# train_homes, with three hidden layers of 2, 3 and 2 units.
# The response is written as a bare column name (not train_homes$Price) so the
# whole formula is resolved inside `data`.
# linear.output = TRUE leaves the output neuron without the activation
# function, which is the setting for a numeric (regression) response.
nn <- neuralnet(
  Price ~ .,
  data = train_homes,
  hidden = c(2, 3, 2),
  linear.output = TRUE
)
nn$act.fct # activation function
## function (x)
## {
## 1/(1 + exp(-x))
## }
## <bytecode: 0x1207319f0>
## <environment: 0x12074f208>
## attr(,"type")
## [1] "logistic"
# devtools is attached here for source_url(), which downloads an R script from
# a URL and evaluates it in this session.
#install.packages('devtools')
library(devtools)
# NOTE(review): this runs remote code at execution time. The URL path contains
# a long hash-like segment, which presumably pins one specific revision of the
# gist -- confirm before relying on it.
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
# plot.nnet() is not defined in the visible code; presumably it comes from the
# script sourced above -- confirm. It is called here on the fitted network.
plot.nnet(nn)
# Predict the scaled price for every test row; compute() is the function used
# for predictions on a neural network. The predictors are selected by name
# (every column except Price) rather than by the hard-coded positions 2:6.
pr_nn <- compute(nn, test_homes[, setdiff(names(test_homes), "Price")])
# Undo the min-max scaling: multiply by the range of the original Price
# column, then add its minimum back.
pr_nn_org <- pr_nn$net.result * diff(range(homes$Price)) + min(homes$Price)
pr_nn_org # provides the final, unstandardized predictions
## [,1]
## 14 303.7929
## 18 225.3665
## 20 218.5979
## 22 200.0108
## 23 260.3561
## 32 230.2863
## 36 274.5236
## 40 319.1962
## 46 271.3713
## 58 319.8534
## 70 270.7519
## 75 237.7602
# Map the scaled test-set prices back to the units of the original Price
# column (same min-max inversion as applied to the predictions).
test_r <- min(homes$Price) + test_homes$Price * diff(range(homes$Price)) # convert back to scale
# Mean squared prediction error: squared differences between predicted and
# observed prices, summed and divided by the number of test rows.
MSPE_nn <- sum((pr_nn_org - test_r)^2) / nrow(test_homes)
MSPE_nn
## [1] 1695.506
Read the train and test datasets
into R. Use the train dataset as your training set, and use
the test dataset as your testing set.
train <- read.csv("/Users/ryannlaky/Documents/University of Indianapolis/MSDA 622/Homework/Homework 5/train.csv")
# Show the column names, types and first few values of the training data.
str(train)
## 'data.frame': 480 obs. of 14 variables:
## $ crim : num 4.2224 6.2881 11.8123 0.5578 0.0616 ...
## $ zn : num 0 0 0 0 0 0 0 0 0 40 ...
## $ indus : num 18.1 18.1 18.1 21.89 4.39 ...
## $ chas : int 1 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.77 0.74 0.718 0.624 0.442 0.437 0.631 0.499 0.437 0.429 ...
## $ rm : num 5.8 6.34 6.82 6.33 5.9 ...
## $ age : num 89 96.4 76.5 98.2 52.3 18.4 100 68.2 45.8 34.5 ...
## $ dis : num 1.9 2.07 1.79 2.11 8.01 ...
## $ rad : int 24 24 24 4 3 4 24 5 5 1 ...
## $ tax : int 666 666 666 437 352 289 666 279 398 335 ...
## $ ptratio: num 20.2 20.2 20.2 21.2 18.8 16 20.2 19.2 18.7 19.7 ...
## $ black : num 353 318 48.5 394.7 364.6 ...
## $ lstat : num 14.6 17.8 22.7 17 12.7 ...
## $ medv : num 16.8 14.9 8.4 18.1 17.2 23.9 50 18.9 20.8 26.6 ...
# Read the testing set from CSV.
# NOTE(review): the path is absolute and specific to one user's machine; the
# script will not run elsewhere without editing it.
test <- read.csv("/Users/ryannlaky/Documents/University of Indianapolis/MSDA 622/Homework/Homework 5/test.csv")
# Show the column names, types and first few values of the testing data.
str(test)
## 'data.frame': 26 obs. of 14 variables:
## $ crim : num 0.0324 0.7503 0.0519 0.169 0.59 ...
## $ zn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ indus : num 2.18 8.14 4.49 25.65 21.89 ...
## $ chas : int 0 0 0 0 0 0 1 0 0 0 ...
## $ nox : num 0.458 0.538 0.449 0.581 0.624 0.624 0.871 0.605 0.605 0.51 ...
## $ rm : num 7 5.92 6.01 5.99 6.37 ...
## $ age : num 45.8 94.1 45.1 88.4 97.9 93.5 88 97.4 100 84.1 ...
## $ dis : num 6.06 4.4 4.43 1.99 2.33 ...
## $ rad : int 3 4 3 2 4 4 5 5 5 5 ...
## $ tax : int 222 307 247 188 437 437 403 403 403 296 ...
## $ ptratio: num 18.7 21 18.5 19.1 21.2 21.2 14.7 14.7 14.7 16.6 ...
## $ black : num 395 394 396 385 386 ...
## $ lstat : num 2.94 16.3 12.86 14.81 11.12 ...
## $ medv : num 33.4 15.6 22.5 21.4 23 17.4 15.3 41.3 24.3 23.6 ...
# install.packages("glmnet")
library(glmnet)
# LASSO regression (alpha = 1) of medv on all remaining columns of `train`.
# The response is selected by name instead of by column position 14, so the
# code does not depend on medv being the last column.
lreg <- glmnet(
  x = as.matrix(train[, names(train) != "medv"]),
  y = train[["medv"]],
  alpha = 1
)
# Cross-validate over the lambda path to choose the penalty. cv.glmnet()
# assigns observations to folds at random, so a fixed seed (arbitrary value)
# is needed for the selected lambda to be repeatable between runs.
set.seed(123)
lambda_info <- cv.glmnet(
  x = as.matrix(train[, names(train) != "medv"]),
  y = train[["medv"]],
  alpha = 1
)
# lambda.min is the lambda with the smallest mean cross-validated error.
min_lambda <- lambda_info$lambda.min
min_lambda
## [1] 0.03043668
# Predict the response for the test rows at the cross-validated lambda; the
# 14th column (the response) is excluded from the predictor matrix.
y_predicted <- predict(
  lreg,
  newx = as.matrix(test[, -14]),
  s = min_lambda
)
# Report the mean squared error between predictions and the observed 14th column.
print(paste0("MSE: ", mean((y_predicted - test[, 14])^2)))
## [1] "MSE: 24.0933577028936"
# Print the fitted coefficients at the selected lambda; in the sparse-matrix
# printout a "." marks a coefficient that is exactly zero.
coef(lreg, s = min_lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 34.184228273
## crim -0.098203107
## zn 0.042147254
## indus .
## chas 2.859623514
## nox -15.386141974
## rm 3.776202467
## age .
## dis -1.377510667
## rad 0.246784455
## tax -0.009494388
## ptratio -0.904718617
## black 0.008943527
## lstat -0.547515273
* According to the above, the LASSO shrank the coefficients of `indus` and `age` to exactly zero at the selected lambda, so these two variables were effectively dropped from the model.