library(readr)
homes <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Homework 5/homes.csv")
## Rows: 76 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Status, School
## dbl (16): Price, Floor, Lot, Bath, Bed, BathBed, Year, Age, AgeSq, Gar, DAc,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(homes)
## spc_tbl_ [76 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Price : num [1:76] 388 450 386 350 156 ...
## $ Floor : num [1:76] 2.18 2.05 2.11 1.44 1.8 ...
## $ Lot : num [1:76] 4 5 5 6 1 5 4 4 4 5 ...
## $ Bath : num [1:76] 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ Bed : num [1:76] 4 4 4 2 4 3 4 4 4 3 ...
## $ BathBed: num [1:76] 12 12 8 2 8 6 4.4 8 8.4 6.3 ...
## $ Year : num [1:76] 1940 1957 1955 1956 1994 ...
## $ Age : num [1:76] -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ AgeSq : num [1:76] 9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
## $ Gar : num [1:76] 0 2 2 1 1 1 1 2 2 2 ...
## $ Status : chr [1:76] "Sold" "Sold" "Sold" "Active" ...
## $ DAc : num [1:76] 0 0 0 1 0 0 1 0 1 0 ...
## $ School : chr [1:76] "Edison" "Edison" "Edison" "Adams" ...
## $ DEd : num [1:76] 1 1 1 0 0 0 0 0 0 0 ...
## $ DHa : num [1:76] 0 0 0 0 0 0 0 0 0 0 ...
## $ DAd : num [1:76] 0 0 0 1 1 1 0 0 0 0 ...
## $ DCr : num [1:76] 0 0 0 0 0 0 0 0 0 0 ...
## $ DPa : num [1:76] 0 0 0 0 0 0 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Price = col_double(),
## .. Floor = col_double(),
## .. Lot = col_double(),
## .. Bath = col_double(),
## .. Bed = col_double(),
## .. BathBed = col_double(),
## .. Year = col_double(),
## .. Age = col_double(),
## .. AgeSq = col_double(),
## .. Gar = col_double(),
## .. Status = col_character(),
## .. DAc = col_double(),
## .. School = col_character(),
## .. DEd = col_double(),
## .. DHa = col_double(),
## .. DAd = col_double(),
## .. DCr = col_double(),
## .. DPa = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
homes_numeric <- homes %>%
select_if(is.numeric)
str(homes_numeric)
## tibble [76 × 16] (S3: tbl_df/tbl/data.frame)
## $ Price : num [1:76] 388 450 386 350 156 ...
## $ Floor : num [1:76] 2.18 2.05 2.11 1.44 1.8 ...
## $ Lot : num [1:76] 4 5 5 6 1 5 4 4 4 5 ...
## $ Bath : num [1:76] 3 3 2 1 2 2 1.1 2 2.1 2.1 ...
## $ Bed : num [1:76] 4 4 4 2 4 3 4 4 4 3 ...
## $ BathBed: num [1:76] 12 12 8 2 8 6 4.4 8 8.4 6.3 ...
## $ Year : num [1:76] 1940 1957 1955 1956 1994 ...
## $ Age : num [1:76] -3 -1.3 -1.5 -1.4 2.4 -3 -1.2 -0.9 -0.5 -0.2 ...
## $ AgeSq : num [1:76] 9 1.69 2.25 1.96 5.76 9 1.44 0.81 0.25 0.04 ...
## $ Gar : num [1:76] 0 2 2 1 1 1 1 2 2 2 ...
## $ DAc : num [1:76] 0 0 0 1 0 0 1 0 1 0 ...
## $ DEd : num [1:76] 1 1 1 0 0 0 0 0 0 0 ...
## $ DHa : num [1:76] 0 0 0 0 0 0 0 0 0 0 ...
## $ DAd : num [1:76] 0 0 0 1 1 1 0 0 0 0 ...
## $ DCr : num [1:76] 0 0 0 0 0 0 0 0 0 0 ...
## $ DPa : num [1:76] 0 0 0 0 0 0 1 1 1 1 ...
homes_normalized <- as.data.frame(scale(homes_numeric))
set.seed(123)
index <- sample(1:nrow(homes_normalized), round(0.85*nrow(homes_normalized)), replace = FALSE)
homes_train <- homes_normalized[index, ]
homes_test <- homes_normalized[-index, ]
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.2.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
homes_nn <- neuralnet(homes_train$Price~., data = homes_train, hidden = c(2,3,2), linear.output=T)
homes_nn$act.fct
## function (x)
## {
## 1/(1 + exp(-x))
## }
## <bytecode: 0x00000257393204d0>
## <environment: 0x0000025739327b18>
## attr(,"type")
## [1] "logistic"
plot(homes_nn)
pr_nn <- compute(homes_nn, homes_test[,2:16])
pr_nn #The final, unstandardized prediction.
## $neurons
## $neurons[[1]]
## Floor Lot Bath Bed BathBed Year
## 3 1 0.6666288 0.612836710 -0.3645200 0.7493631 0.1185069 -0.6132974
## 4 1 -2.4875003 1.217714501 -2.1179074 -1.9626175 -2.0517398 -0.5707306
## 24 1 -0.4820391 0.612836710 -0.3645200 0.7493631 0.1185069 -0.1450630
## 33 1 -0.8021597 -1.201796664 -0.1891813 -0.6066272 -0.4963963 1.3022067
## 40 1 0.7466589 0.612836710 -0.3645200 -0.6066272 -0.6049087 1.4299070
## 46 1 -0.3031482 -0.596918873 -0.1891813 -0.6066272 -0.4963963 0.2380378
## 47 1 0.3653388 0.612836710 -0.3645200 -1.9626175 -1.3283242 1.0042394
## 58 1 0.5159838 0.007958918 -0.1891813 -0.6066272 -0.4963963 1.5150405
## 59 1 0.7325360 -1.806674455 1.5642061 -0.6066272 0.5887270 1.5150405
## 60 1 0.7325360 -1.806674455 1.5642061 -0.6066272 0.5887270 1.5150405
## 61 1 0.7325360 -1.806674455 1.5642061 -0.6066272 0.5887270 1.5150405
## Age AgeSq Gar DAc DEd DHa DAd
## 3 -0.6132974 -0.3899166 0.5626235 -0.6955186 2.2941573 -0.4720544 -0.2013831
## 4 -0.5707306 -0.4252542 -0.7331155 1.4188580 -0.4301545 -0.4720544 4.9003222
## 24 -0.1450630 -0.6445914 0.5626235 -0.6955186 -0.4301545 -0.4720544 -0.2013831
## 33 1.3022067 0.4325975 -0.7331155 -0.6955186 -0.4301545 2.0905264 -0.2013831
## 40 1.4299070 0.6629015 0.5626235 -0.6955186 -0.4301545 -0.4720544 -0.2013831
## 46 0.2380378 -0.6336245 -2.0288545 -0.6955186 -0.4301545 -0.4720544 -0.2013831
## 47 1.0042394 -0.0194806 1.8583625 -0.6955186 -0.4301545 -0.4720544 -0.2013831
## 58 1.5150405 0.8286229 0.5626235 1.4188580 -0.4301545 2.0905264 -0.2013831
## 59 1.5150405 0.8286229 0.5626235 1.4188580 -0.4301545 2.0905264 -0.2013831
## 60 1.5150405 0.8286229 0.5626235 -0.6955186 -0.4301545 2.0905264 -0.2013831
## 61 1.5150405 0.8286229 0.5626235 1.4188580 -0.4301545 2.0905264 -0.2013831
## DCr DPa
## 3 -0.2908375 -0.4926115
## 4 -0.2908375 -0.4926115
## 24 -0.2908375 -0.4926115
## 33 -0.2908375 -0.4926115
## 40 -0.2908375 2.0032868
## 46 -0.2908375 -0.4926115
## 47 -0.2908375 -0.4926115
## 58 -0.2908375 -0.4926115
## 59 -0.2908375 -0.4926115
## 60 -0.2908375 -0.4926115
## 61 -0.2908375 -0.4926115
##
## $neurons[[2]]
## [,1] [,2] [,3]
## 3 1 3.283940e-07 0.008352536
## 4 1 9.991069e-01 0.999935850
## 24 1 9.783269e-01 0.999604411
## 33 1 9.455412e-01 0.989454522
## 40 1 2.646445e-01 0.923685530
## 46 1 4.686598e-06 0.999867192
## 47 1 9.735337e-01 0.056450497
## 58 1 9.999900e-01 0.002695886
## 59 1 9.753414e-01 0.002898065
## 60 1 3.574886e-02 0.019861042
## 61 1 9.753414e-01 0.002898065
##
## $neurons[[3]]
## [,1] [,2] [,3] [,4]
## 3 1 0.70661360 0.6466978 0.33921758
## 4 1 0.20428097 0.2623677 0.70261000
## 24 1 0.20112851 0.2575932 0.70908421
## 33 1 0.20121981 0.2554539 0.71305807
## 40 1 0.13716093 0.1514111 0.85343017
## 46 1 0.08712336 0.0933858 0.91745374
## 47 1 0.84380436 0.8419388 0.11643013
## 58 1 0.86853185 0.8654777 0.09670357
## 59 1 0.86564200 0.8618077 0.10014938
## 60 1 0.70618705 0.6491956 0.33481113
## 61 1 0.86564200 0.8618077 0.10014938
##
## $neurons[[4]]
## [,1] [,2] [,3]
## 3 1 0.99833791 5.927695e-04
## 4 1 0.47422328 1.725041e-01
## 24 1 0.45544079 1.791884e-01
## 33 1 0.44757163 1.803493e-01
## 40 1 0.13791335 3.646439e-01
## 46 1 0.06065411 5.235147e-01
## 47 1 0.99991264 8.924727e-05
## 58 1 0.99993912 6.639947e-05
## 59 1 0.99993577 6.892556e-05
## 60 1 0.99839454 5.894222e-04
## 61 1 0.99993577 6.892556e-05
##
##
## $net.result
## [,1]
## 3 0.9769251
## 4 -0.1111201
## 24 -0.1506286
## 33 -0.1655634
## 40 -0.8900278
## 46 -1.1821663
## 47 0.9801813
## 58 0.9802503
## 59 0.9802419
## 60 0.9770276
## 61 0.9802419
pr_nn_org <- pr_nn$net.result*(max(homes$Price) - min(homes$Price)) + min(homes$Price)
pr_nn_org
## [,1]
## 3 443.2044
## 4 122.7751
## 24 111.1399
## 33 106.7416
## 40 -106.6132
## 46 -192.6480
## 47 444.1634
## 58 444.1837
## 59 444.1812
## 60 443.2346
## 61 444.1812
test_r <- (homes_test$Price)*(max(homes$Price) - min(homes$Price)) + min(homes$Price)
MSPE_nn <- sum((test_r - pr_nn_org)^2)/nrow(homes_test)
MSPE_nn
## [1] 62591.92
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Loaded glmnet 4.1-7
train <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Homework 5/train.csv")
## Rows: 480 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## spc_tbl_ [480 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ crim : num [1:480] 4.2224 6.2881 11.8123 0.5578 0.0616 ...
## $ zn : num [1:480] 0 0 0 0 0 0 0 0 0 40 ...
## $ indus : num [1:480] 18.1 18.1 18.1 21.89 4.39 ...
## $ chas : num [1:480] 1 0 0 0 0 0 0 0 0 0 ...
## $ nox : num [1:480] 0.77 0.74 0.718 0.624 0.442 0.437 0.631 0.499 0.437 0.429 ...
## $ rm : num [1:480] 5.8 6.34 6.82 6.33 5.9 ...
## $ age : num [1:480] 89 96.4 76.5 98.2 52.3 18.4 100 68.2 45.8 34.5 ...
## $ dis : num [1:480] 1.9 2.07 1.79 2.11 8.01 ...
## $ rad : num [1:480] 24 24 24 4 3 4 24 5 5 1 ...
## $ tax : num [1:480] 666 666 666 437 352 289 666 279 398 335 ...
## $ ptratio: num [1:480] 20.2 20.2 20.2 21.2 18.8 16 20.2 19.2 18.7 19.7 ...
## $ black : num [1:480] 353 318 48.5 394.7 364.6 ...
## $ lstat : num [1:480] 14.6 17.8 22.7 17 12.7 ...
## $ medv : num [1:480] 16.8 14.9 8.4 18.1 17.2 23.9 50 18.9 20.8 26.6 ...
## - attr(*, "spec")=
## .. cols(
## .. crim = col_double(),
## .. zn = col_double(),
## .. indus = col_double(),
## .. chas = col_double(),
## .. nox = col_double(),
## .. rm = col_double(),
## .. age = col_double(),
## .. dis = col_double(),
## .. rad = col_double(),
## .. tax = col_double(),
## .. ptratio = col_double(),
## .. black = col_double(),
## .. lstat = col_double(),
## .. medv = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
test <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Homework 5/test.csv")
## Rows: 26 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(test)
## spc_tbl_ [26 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ crim : num [1:26] 0.0324 0.7503 0.0519 0.169 0.59 ...
## $ zn : num [1:26] 0 0 0 0 0 0 0 0 0 0 ...
## $ indus : num [1:26] 2.18 8.14 4.49 25.65 21.89 ...
## $ chas : num [1:26] 0 0 0 0 0 0 1 0 0 0 ...
## $ nox : num [1:26] 0.458 0.538 0.449 0.581 0.624 0.624 0.871 0.605 0.605 0.51 ...
## $ rm : num [1:26] 7 5.92 6.01 5.99 6.37 ...
## $ age : num [1:26] 45.8 94.1 45.1 88.4 97.9 93.5 88 97.4 100 84.1 ...
## $ dis : num [1:26] 6.06 4.4 4.43 1.99 2.33 ...
## $ rad : num [1:26] 3 4 3 2 4 4 5 5 5 5 ...
## $ tax : num [1:26] 222 307 247 188 437 437 403 403 403 296 ...
## $ ptratio: num [1:26] 18.7 21 18.5 19.1 21.2 21.2 14.7 14.7 14.7 16.6 ...
## $ black : num [1:26] 395 394 396 385 386 ...
## $ lstat : num [1:26] 2.94 16.3 12.86 14.81 11.12 ...
## $ medv : num [1:26] 33.4 15.6 22.5 21.4 23 17.4 15.3 41.3 24.3 23.6 ...
## - attr(*, "spec")=
## .. cols(
## .. crim = col_double(),
## .. zn = col_double(),
## .. indus = col_double(),
## .. chas = col_double(),
## .. nox = col_double(),
## .. rm = col_double(),
## .. age = col_double(),
## .. dis = col_double(),
## .. rad = col_double(),
## .. tax = col_double(),
## .. ptratio = col_double(),
## .. black = col_double(),
## .. lstat = col_double(),
## .. medv = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
lreg <- glmnet(x = as.matrix(train[, -14]), y = as.matrix(train[, 14]), alpha = 1)
lambda_info <- cv.glmnet(x = as.matrix(train[, -14]), y = as.matrix(train[, 14]), alpha = 1)
min_lambda <- lambda_info$lambda.min
min_lambda
## [1] 0.03340422
lasso_pred <- predict(lreg, s = min_lambda, newx = as.matrix(test[,-14]))
lasso_mse <- mean((test[, 14] - lasso_pred)^2)
## Warning in mean.default((test[, 14] - lasso_pred)^2): argument is not numeric
## or logical: returning NA
lasso_mse
## [1] NA
coef(lreg, s = min_lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 33.981147418
## crim -0.097167906
## zn 0.041696418
## indus .
## chas 2.857034190
## nox -15.271209952
## rm 3.782960879
## age .
## dis -1.367450786
## rad 0.241944052
## tax -0.009293085
## ptratio -0.902839603
## black 0.008915650
## lstat -0.547559168
Based on the coefficients returned by the LASSO regression model, it appears that the covariate age was insignificant, as its coefficient is represented by a dot (‘.’) indicating that its value is estimated to be zero by the model.