# Setup: the packages caTools, MASS, neuralnet, ggplot2, and Rdimtools (v1.1.2)
# were installed and loaded for this assignment; the verbose installation output
# and the Rdimtools startup banner are omitted here.
#HW 3 Rachel Konshok
#07/26/2025
#Question 1:
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# To create the data set suggested in the homework, we extract only crim, indus, nox, dis, and rm as the predictors (x) and medv as the response (y)
dataset = Boston[,c(1,3,5,8,6,14)]
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
# the summary statistics of the data set are shown below
summary(dataset)
## crim indus nox dis
## Min. : 0.00632 Min. : 0.46 Min. :0.3850 Min. : 1.130
## 1st Qu.: 0.08205 1st Qu.: 5.19 1st Qu.:0.4490 1st Qu.: 2.100
## Median : 0.25651 Median : 9.69 Median :0.5380 Median : 3.207
## Mean : 3.61352 Mean :11.14 Mean :0.5547 Mean : 3.795
## 3rd Qu.: 3.67708 3rd Qu.:18.10 3rd Qu.:0.6240 3rd Qu.: 5.188
## Max. :88.97620 Max. :27.74 Max. :0.8710 Max. :12.127
## rm medv
## Min. :3.561 Min. : 5.00
## 1st Qu.:5.886 1st Qu.:17.02
## Median :6.208 Median :21.20
## Mean :6.285 Mean :22.53
## 3rd Qu.:6.623 3rd Qu.:25.00
## Max. :8.780 Max. :50.00
# (a): fit a neural network with 3 hidden layers and 4 nodes in each layer
# first we're going to scale the data set and split it into training and test sets
max = apply(dataset, 2, max)
min = apply(dataset, 2, min)
scaled_dataset = as.data.frame(scale(dataset, center = min, scale = max - min))
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
head(scaled_dataset)
## crim indus nox dis rm medv
## 1 0.0000000000 0.06781525 0.3148148 0.2692031 0.5775053 0.4222222
## 2 0.0002359225 0.24230205 0.1728395 0.3489620 0.5479977 0.3688889
## 3 0.0002356977 0.24230205 0.1728395 0.3489620 0.6943859 0.6600000
## 4 0.0002927957 0.06304985 0.1502058 0.4485446 0.6585553 0.6311111
## 5 0.0007050701 0.06304985 0.1502058 0.4485446 0.6871048 0.6933333
## 6 0.0002644715 0.06304985 0.1502058 0.4485446 0.5497222 0.5266667
set.seed(123)
split = sample.split(scaled_dataset$medv, SplitRatio = .6)
training_set = subset(scaled_dataset, split == TRUE)
test_set = subset(scaled_dataset, split == FALSE)
head(training_set)
## crim indus nox dis rm medv
## 1 0.0000000000 0.06781525 0.3148148 0.2692031 0.5775053 0.4222222
## 2 0.0002359225 0.24230205 0.1728395 0.3489620 0.5479977 0.3688889
## 3 0.0002356977 0.24230205 0.1728395 0.3489620 0.6943859 0.6600000
## 4 0.0002927957 0.06304985 0.1502058 0.4485446 0.6585553 0.6311111
## 6 0.0002644715 0.06304985 0.1502058 0.4485446 0.5497222 0.5266667
## 9 0.0023032514 0.27162757 0.2860082 0.4503542 0.3966277 0.2555556
head(test_set)
## crim indus nox dis rm medv
## 5 0.0007050701 0.06304985 0.1502058 0.4485446 0.6871048 0.6933333
## 7 0.0009213230 0.27162757 0.2860082 0.4029226 0.4696302 0.3977778
## 8 0.0015536719 0.27162757 0.2860082 0.4383872 0.5002874 0.4911111
## 10 0.0018401733 0.27162757 0.2860082 0.4967309 0.4680973 0.3088889
## 15 0.0070994813 0.28152493 0.3148148 0.3030218 0.4857252 0.2933333
## 16 0.0069806771 0.28152493 0.3148148 0.3063591 0.4355240 0.3311111
#Fit the neural network with 3 layers and 4 nodes in each layer
set.seed(3)
Neural_Net = neuralnet(formula = medv ~., data = training_set, hidden = c(4,4,4), linear.output = TRUE)
plot(Neural_Net)
# the plot visualizes the neural network with 3 hidden layers and 4 nodes in each layer
# To predict the response I first need to find the row of the data where x1 = 0.14, x2 = 7.87, and so on
Boston[Boston$indus == 7.87, ]
## crim zn indus chas nox rm age dis rad tax ptratio black
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## 11 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311 15.2 392.52
## 12 0.11747 12.5 7.87 0 0.524 6.009 82.9 6.2267 5 311 15.2 396.90
## 13 0.09378 12.5 7.87 0 0.524 5.889 39.0 5.4509 5 311 15.2 390.50
## lstat medv
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 10 17.10 18.9
## 11 20.45 15.0
## 12 13.27 18.9
## 13 15.71 21.7
# The row that matches the required values is row 8
test_prediction = predict(Neural_Net, test_set[3,])
# above we use test_set row 3 (not column 3) because it corresponds to row 8 of the full
# data set; printing test_set[3,] confirms that its row name is 8
test_prediction
## [,1]
## 8 0.3168627
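# As an alternative to searching for a matching row, a new observation can be scaled
# directly with the min/max vectors computed earlier; a minimal sketch (new_x is a
# hypothetical input built from the row-8 values printed above):
new_x = data.frame(crim = 0.14455, indus = 7.87, nox = 0.524, dis = 5.9505, rm = 6.172)
new_x_scaled = as.data.frame(scale(new_x, center = min[1:5], scale = (max - min)[1:5]))
predict(Neural_Net, new_x_scaled)  # should match test_prediction above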
# the response is 0.3168627 using this model
# the true value from our data set (scaled_dataset[8,]) is 0.4911, the scaled value for 27.1
value1 = (.3168627-.49111)^2
value1
## [1] 0.03036212
# Thus the squared prediction error on the scaled data is about 0.0304
sqrt(value1)
## [1] 0.1742473
# The square root is about 0.174 in scaled units; since medv spans 5 to 50 (in $1000s),
# this corresponds to roughly 0.174 * 45 = 7.8, i.e. an error of about $7,800
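# helper (not part of the original assignment code): map a scaled medv back to the
# original units ($1000s) using the min/max vectors computed earlier
unscale_medv = function(y) y * (max["medv"] - min["medv"]) + min["medv"]
unscale_medv(0.4911111)  # 27.1, the true medv for row 8
unscale_medv(0.3168627)  # about 19.26, the model's prediction in original units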
# now to repeat for (b) with 2 hidden layers and 3 nodes in each layer
set.seed(3)
Neural_Net2 = neuralnet(formula = medv ~., data = training_set, hidden = c(3,3), linear.output = TRUE)
plot(Neural_Net2)
test_prediction2 = predict(Neural_Net2, test_set[3,])
test_prediction2
## [,1]
## 8 0.3042053
#the response is .3042053 using this model
# the true value from our data set (scaled_dataset[8,]) is 0.4911, the scaled value for 27.1
value2 = (.3042053-.49111)^2
value2
## [1] 0.03493337
#Thus our predicted error from the scaled numbers is .03493337
sqrt(value2)
## [1] 0.1869047
# the square root is 0.1869 in scaled units, i.e. roughly 0.187 * 45 = 8.4, an error of about $8,400
# therefore the second model does not perform as well as the first
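# A fuller comparison would score both models on the entire test set rather than on a
# single row; a sketch, where rmse_dollars is a helper defined here (not part of the
# assignment) that reports RMSE in the original medv units ($1000s):
rmse_dollars = function(model, data) {
  pred = predict(model, data)
  sqrt(mean((pred - data$medv)^2)) * (max["medv"] - min["medv"])
}
rmse_dollars(Neural_Net, test_set)   # model (a): 3 layers of 4 nodes
rmse_dollars(Neural_Net2, test_set)  # model (b): 2 layers of 3 nodes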
# moving on to Question 2: nonparametric regression on the Boston data set in MASS
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
#first we're going to extract just age and medv for these problems
data2 = Boston[,c(7,14)]
head(data2)
## age medv
## 1 65.2 24.0
## 2 78.9 21.6
## 3 61.1 34.7
## 4 45.8 33.4
## 5 54.2 36.2
## 6 58.7 28.7
localaverage=with(data2, supsmu(age, medv))
head(localaverage)
## $x
## [1] 2.9 6.0 6.2 6.5 6.6 6.8 7.8 8.4 8.9 9.8 9.9 10.0
## [13] 13.0 13.9 14.7 15.3 15.7 15.8 16.3 17.0 17.2 17.5 17.7 17.8
## [25] 18.4 18.5 18.8 19.1 19.5 20.1 20.8 21.1 21.4 21.5 21.8 21.9
## [37] 22.3 22.9 23.3 23.4 24.8 25.8 26.3 27.6 27.7 27.9 28.1 28.4
## [49] 28.8 28.9 29.1 29.2 29.3 29.7 30.2 30.8 31.1 31.3 31.5 31.9
## [61] 32.0 32.1 32.2 32.3 32.9 33.0 33.1 33.2 33.3 33.5 33.8 34.1
## [73] 34.2 34.5 34.9 35.7 35.9 36.1 36.6 36.8 36.9 37.2 37.3 37.8
## [85] 38.1 38.3 38.4 38.5 38.9 39.0 40.0 40.1 40.3 40.4 40.5 41.1
## [97] 41.5 41.9 42.1 42.2 42.3 42.4 42.6 42.8 43.4 43.7 44.4 45.0
## [109] 45.1 45.4 45.6 45.7 45.8 46.3 46.7 47.2 47.4 47.6 48.0 48.2
## [121] 48.5 49.0 49.1 49.3 49.7 49.9 51.0 51.8 51.9 52.3 52.5 52.6
## [133] 52.8 52.9 53.2 53.6 53.7 53.8 54.0 54.2 54.3 54.4 56.0 56.1
## [145] 56.4 56.5 56.7 56.8 57.8 58.0 58.1 58.4 58.5 58.7 58.8 59.1
## [157] 59.5 59.6 59.7 61.1 61.4 61.5 61.8 62.0 62.2 62.5 62.8 63.0
## [169] 63.1 64.5 64.7 65.1 65.2 65.3 65.4 66.1 66.2 66.5 66.6 67.0
## [181] 67.2 67.6 67.8 68.1 68.2 68.7 68.8 69.1 69.5 69.6 69.7 70.2
## [193] 70.3 70.4 70.6 71.0 71.3 71.6 71.7 71.9 72.5 72.7 72.9 73.1
## [205] 73.3 73.4 73.5 73.9 74.3 74.4 74.5 74.8 74.9 75.0 76.0 76.5
## [217] 76.7 76.9 77.0 77.3 77.7 77.8 78.1 78.3 78.7 78.9 79.2 79.7
## [229] 79.8 79.9 80.3 80.8 81.3 81.6 81.7 81.8 82.0 82.5 82.6 82.8
## [241] 82.9 83.0 83.2 83.3 83.4 83.5 83.7 84.0 84.1 84.2 84.4 84.5
## [253] 84.6 84.7 85.1 85.2 85.4 85.5 85.7 85.9 86.1 86.3 86.5 86.9
## [265] 87.3 87.4 87.6 87.9 88.0 88.2 88.4 88.5 88.6 88.8 89.0 89.1
## [277] 89.2 89.3 89.4 89.5 89.6 89.8 89.9 90.0 90.3 90.4 90.7 90.8
## [289] 91.0 91.1 91.2 91.3 91.4 91.5 91.6 91.7 91.8 91.9 92.1 92.2
## [301] 92.4 92.6 92.7 92.9 93.0 93.3 93.4 93.5 93.6 93.8 93.9 94.0
## [313] 94.1 94.3 94.4 94.5 94.6 94.7 94.8 94.9 95.0 95.2 95.3 95.4
## [325] 95.6 95.7 95.8 96.0 96.1 96.2 96.4 96.6 96.7 96.8 96.9 97.0
## [337] 97.1 97.2 97.3 97.4 97.5 97.7 97.8 97.9 98.0 98.1 98.2 98.3
## [349] 98.4 98.5 98.7 98.8 98.9 99.1 99.3 100.0
##
## $y
## [1] 26.63125 26.88226 26.89845 26.92274 26.93084 26.94703 27.02800 27.07658
## [9] 27.11707 27.18994 27.19803 27.20613 27.46354 27.51415 27.55332 27.57800
## [17] 27.58941 27.58571 27.58321 27.57407 27.57108 27.56931 27.57192 27.57646
## [25] 27.55442 27.53671 27.51811 27.50311 27.49319 27.49054 27.49078 27.48877
## [33] 27.49055 27.49565 27.48971 27.48267 27.46942 27.46034 27.45879 27.46356
## [41] 27.44522 27.42466 27.41644 27.38360 27.36432 27.34221 27.32425 27.30033
## [49] 27.26692 27.24974 27.22982 27.22574 27.22270 27.17352 27.09896 26.99056
## [57] 26.93007 26.89506 26.85955 26.78816 26.78580 26.78099 26.77434 26.74628
## [65] 26.66064 26.64312 26.62706 26.61120 26.59589 26.57744 26.55465 26.53510
## [73] 26.52258 26.50260 26.47857 26.43469 26.41964 26.40453 26.37373 26.36316
## [81] 26.35977 26.34078 26.33379 26.30392 26.28609 26.27452 26.26830 26.26521
## [89] 26.24627 26.24508 26.18526 26.18233 26.16679 26.15796 26.14113 26.09446
## [97] 26.06273 26.02877 26.01397 26.00395 25.99869 25.99554 25.98104 25.96413
## [105] 25.90141 25.87077 25.78894 25.71437 25.70282 25.66892 25.64960 25.64126
## [113] 25.63674 25.58297 25.53973 25.48282 25.45777 25.43055 25.38966 25.36786
## [121] 25.33436 25.28702 25.27975 25.26151 25.23055 25.21486 25.13919 25.09030
## [129] 25.08903 25.06979 25.06192 25.05666 25.04639 25.03813 25.01668 24.99105
## [137] 24.97605 24.96495 24.95012 24.93544 24.93259 24.93500 24.78993 24.78321
## [145] 24.75900 24.75333 24.73628 24.72689 24.61493 24.59481 24.58370 24.54890
## [153] 24.53826 24.51625 24.50732 24.47178 24.42457 24.41241 24.39962 24.25317
## [161] 24.21849 24.19929 24.15664 24.13317 24.11417 24.08166 24.05596 24.03894
## [169] 24.02984 23.87320 23.84971 23.80371 23.78967 23.77594 23.76653 23.68814
## [177] 23.67951 23.64760 23.63538 23.59049 23.56739 23.52244 23.50596 23.47808
## [185] 23.46975 23.42854 23.41928 23.39645 23.36646 23.36086 23.35430 23.31690
## [193] 23.30828 23.29238 23.26859 23.23698 23.21414 23.19086 23.18438 23.16497
## [201] 23.10565 23.08232 23.05823 23.03640 23.01506 23.00664 23.00029 22.94831
## [209] 22.89555 22.88169 22.86951 22.82997 22.81913 22.80781 22.66129 22.59045
## [217] 22.56458 22.53466 22.51764 22.47104 22.40497 22.38442 22.33916 22.30864
## [225] 22.23925 22.20162 22.14367 22.04416 22.02480 22.00315 21.91804 21.80616
## [233] 21.68666 21.61220 21.58891 21.56764 21.51505 21.39195 21.38386 21.34239
## [241] 21.32216 21.30659 21.26550 21.24366 21.22339 21.20423 21.17398 21.13241
## [249] 21.11367 21.09437 21.06236 21.05097 21.03906 21.03278 20.99261 20.98489
## [257] 20.96322 20.95405 20.92534 20.89888 20.86933 20.84166 20.81603 20.75834
## [265] 20.69846 20.67745 20.64398 20.58336 20.56128 20.51601 20.45909 20.42324
## [273] 20.38696 20.30950 20.22888 20.18437 20.14003 20.09446 20.05158 20.00848
## [281] 19.96569 19.86393 19.81216 19.76195 19.60249 19.55123 19.38330 19.33088
## [289] 19.22325 19.17471 19.12716 19.07794 19.02726 18.97932 18.93545 18.89577
## [297] 18.85725 18.81986 18.76400 18.74069 18.70232 18.67526 18.66923 18.67081
## [305] 18.67747 18.70686 18.72749 18.75297 18.76855 18.80942 18.82409 18.83546
## [313] 18.83526 18.84544 18.84897 18.84968 18.85332 18.83947 18.81297 18.77173
## [321] 18.72456 18.61106 18.54297 18.45703 18.27638 18.17443 18.06366 17.84104
## [329] 17.75799 17.65985 17.47923 17.31918 17.26406 17.23275 17.19137 17.15916
## [337] 17.13874 17.13450 17.12313 17.13896 17.18844 17.21216 17.17070 17.03312
## [345] 16.80016 16.62456 16.50384 16.34404 16.14298 15.99432 15.73886 15.61511
## [353] 15.33680 15.10889 15.12312 15.62703
plot(data2$age,data2$medv,main="local averaging fitted curve")
lines(localaverage$x,localaverage$y,col="blue")
# we can compare the graph above, which uses the local-averaging curve, to a classic linear model
lm1 <- lm(medv ~ age, data = data2)
lm1
##
## Call:
## lm(formula = medv ~ age, data = data2)
##
## Coefficients:
## (Intercept) age
## 30.9787 -0.1232
summary(lm1)$r.squared
## [1] 0.1420947
# the R^2 value for the lm model is only 0.142, and its residuals-vs-fitted diagnostic (drawn below) suggests a worse fit than the local-averaging curve gave
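# the diagnostic referenced above can be reproduced with R's built-in lm plot method:
plot(lm1, which = 1)  # residuals vs. fitted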
coefs <- coef(lm1)
xseq <- seq(min(Boston$age), max(Boston$age), length.out = 100)
yhat <- coefs[1] + coefs[2] * xseq
plot(Boston$age, Boston$medv,
xlab = "Age",
ylab = "Medv",
main = "Linear Model",
pch = 19, col = "blue")
abline(lm1, col = "red", lwd = 3, lty = 8)
# Now to move on to the Gaussian kernel (Nadaraya-Watson) method
# visualizing this method at bandwidths 10, 1, and 5, since it's hard to see differences with so many data points
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 10)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='red', lwd = 3, lty = 8)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 1)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='green', lwd = 3, lty = 8)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 5)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='blue', lwd = 3, lty = 8)
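# optionally, the three bandwidths can be overlaid on a single plot for a direct comparison:
plot(data2$age, data2$medv, main = "N-W kernel regression, bandwidths 1, 5, 10")
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 1), col = "green", lwd = 2)
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 5), col = "blue", lwd = 2)
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 10), col = "red", lwd = 2)
legend("topright", legend = c("h = 1", "h = 5", "h = 10"),
       col = c("green", "blue", "red"), lwd = 2)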
#now for local polynomial regression
xx2=sort(data2$age, index.return = TRUE)
x2=xx2$x
y2=data2$medv[xx2$ix]
locLinear <-loess(y2~x2,degree=1) # local linear
locQuad <-loess(y2~x2,degree=2) # local quadratic
head(locLinear$fitted)
## [1] 28.70314 28.48887 28.47518 28.45466 28.44783 28.44783
head(locQuad$fitted)
## [1] 27.87377 27.83392 27.83079 27.82596 27.82432 27.82432
plot(x2,y2,main="Local linear regression")
lines(x2,locLinear$fitted,lwd = 3, lty = 8, col = "purple")
plot(x2,y2,main="Local Quadratic regression")
lines(x2,locQuad$fitted,lwd = 3, lty = 8, col = "yellow")
#########################
## predict the house price of a house with age x = 20 and 50
#########################
## predict the house price of a house with age x0
locLinear <-loess(medv ~ age,degree=1,span = 0.75, data = data2)
locQuad <-loess(medv~age,degree=2,span=0.75, data = data2)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 15)
localaverage=supsmu(data2$age,data2$medv,bass=10)
houseAge=c(20, 50)
Linear.predicted.Price=predict(locLinear,houseAge)
Linear.predicted.Price
## [1] 27.54145 25.33634
Quadratic.predicted.Price=predict(locQuad,houseAge)
Quadratic.predicted.Price
## [1] 27.44647 25.48815
NW.predicted.Price=NULL
for (re in 1:length(houseAge)){
Index=which.min(abs(kerreg$x-houseAge[re]))
NW.predicted.Price[re]=kerreg$y[Index]
}
NW.predicted.Price
## [1] 27.55588 25.03678
LA.predicted.Price=NULL
for (re in 1:length(houseAge)){
Index=which.min(abs(localaverage$x-houseAge[re]))
LA.predicted.Price[re]=localaverage$y[Index]
}
LA.predicted.Price
## [1] 27.48646 25.31111
# houseAge is kept as the variable name, but houseAge = c(20, 50) corresponds to x = 20 and x = 50
# in this case x = age and y = medv
houseAge
## [1] 20 50
Linear.predicted.Price
## [1] 27.54145 25.33634
Quadratic.predicted.Price
## [1] 27.44647 25.48815
NW.predicted.Price
## [1] 27.55588 25.03678
LA.predicted.Price
## [1] 27.48646 25.31111
target <- 20
closest_row <- data2[which.min(abs(data2$age - target)), ]
closest_row
## age medv
## 299 20.1 22.5
target2 = 50
closest_row2 <- data2[which.min(abs(data2$age - target2)), ]
closest_row2
## age medv
## 323 49.9 20.4
# The closest observations from the search above: at x = 20 (nearest age 20.1), medv = 22.5;
# at x = 50 (nearest age 49.9), medv = 20.4
# With that in mind: local linear predicts y = 27.54145 at x = 20 and y = 25.33634 at x = 50
# local quadratic: y = 27.44647 and y = 25.48815
# N-W kernel: y = 27.55588 and y = 25.03678
# local averaging: y = 27.48646 and y = 25.31111
# Interestingly, the four methods agree closely with each other but not with the single nearest
# observations. This is expected: the smoothers estimate the conditional mean of medv at a given
# age, averaged over many nearby houses, while the lookup above returns one individual house,
# which can deviate substantially from that mean (see the quick check below)
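# a quick check of that explanation (the +/-5-year window width is an arbitrary choice
# made here): averaging medv over a small age window should land much nearer the
# smoothed values than any single house does
mean(data2$medv[abs(data2$age - 20) <= 5])
mean(data2$medv[abs(data2$age - 50) <= 5])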
#Question 3 below
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
datasetnew = dataset[,1:5]
head(datasetnew)
## crim indus nox dis rm
## 1 0.00632 2.31 0.538 4.0900 6.575
## 2 0.02731 7.07 0.469 4.9671 6.421
## 3 0.02729 7.07 0.469 4.9671 7.185
## 4 0.03237 2.18 0.458 6.0622 6.998
## 5 0.06905 2.18 0.458 6.0622 7.147
## 6 0.02985 2.18 0.458 6.0622 6.430
# datasetnew is now Boston restricted to five variables: x1 = crim, x2 = indus, x3 = nox, x4 = dis, and x5 = rm, for dimension reduction
# Starting with principal component analysis
resultspca = prcomp(datasetnew, scale = TRUE)
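# flip the sign of the loadings; PCA loadings are only identified up to sign,
# so this is purely cosmetic and makes the components easier to read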
resultspca$rotation = -1*resultspca$rotation
resultspca$rotation
## PC1 PC2 PC3 PC4 PC5
## crim -0.3536280 -0.009216982 0.9348817 -0.01274628 -0.026381521
## indus -0.5211711 0.017127860 -0.1973602 0.74195861 -0.372335441
## nox -0.5250662 0.180439311 -0.1740364 -0.01118060 0.813224145
## dis 0.4988737 -0.321526658 0.2068162 0.63690335 0.446460277
## rm 0.2806392 0.929345978 0.1182520 0.20872009 0.003169356
resultspca$sdev^2 /sum(resultspca$sdev^2)
## [1] 0.58677604 0.17305491 0.14481525 0.05270987 0.04264392
#58% can be explained by principal component 1, 17% by 2, 14% by 3, 5% by 4 and 4% by 5 (roughly)
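# the cumulative proportion is often easier to read: the first two components
# together explain about 76% of the variance
cumsum(resultspca$sdev^2) / sum(resultspca$sdev^2)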
resultspca
## Standard deviations (1, .., p=5):
## [1] 1.7128573 0.9302014 0.8509267 0.5133706 0.4617571
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## crim -0.3536280 -0.009216982 0.9348817 -0.01274628 -0.026381521
## indus -0.5211711 0.017127860 -0.1973602 0.74195861 -0.372335441
## nox -0.5250662 0.180439311 -0.1740364 -0.01118060 0.813224145
## dis 0.4988737 -0.321526658 0.2068162 0.63690335 0.446460277
## rm 0.2806392 0.929345978 0.1182520 0.20872009 0.003169356
biplot(resultspca, scale = 0)
# above is the 'basic' PCA biplot for this data set
# scree plot
var_explained = resultspca$sdev^2 / sum(resultspca$sdev^2)
qplot(x = 1:5, y = var_explained, geom = c("point", "line")) +
coord_cartesian(ylim = c(0, 1)) +
labs(
x = "Principal Component",
y = "Variance Explained",
title = "Scree Plot"
)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# locally linear embedding (LLE)
mass = as.matrix(datasetnew)
s.mass = scale(mass, center = TRUE, scale = TRUE)
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .1))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,10% neighbor")
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .2))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,20% neighbor")
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .5))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,50% neighbor")
head(LLE$Y)
## [,1] [,2]
## [1,] 0.02449657 0.012017386
## [2,] 0.02987192 -0.007409192
## [3,] 0.03499297 0.003688514
## [4,] 0.04831477 -0.015217832
## [5,] 0.04933405 -0.013011205
## [6,] 0.04440351 -0.023724474
head(resultspca$x)
## PC1 PC2 PC3 PC4 PC5
## 1 -1.080364 -0.2948576 0.03521451 0.77220411 -0.4368063
## 2 -1.176831 0.1383458 0.00601154 0.03123117 0.1205675
## 3 -1.481989 -0.8721908 -0.12256921 -0.19572340 0.1171212
## 4 -2.087864 -0.4282909 -0.35640356 0.05640554 -0.3024068
## 5 -2.145869 -0.6253327 -0.38546726 0.01219782 -0.3029664
## 6 -1.861097 0.3229954 -0.26053399 0.22513241 -0.2998524
# Kernel PCA
install.packages("kernlab")
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
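# run kernel PCA with an RBF kernel; note that datasetnew[,-5] drops rm, so only
# crim, indus, nox, and dis enter here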
kpc = kpca(~.,data=datasetnew[,-5], kernel="rbfdot",
kpar=list(sigma=.2),features=2)
head(pcv(kpc))
## [,1] [,2]
## [1,] -0.03310871 -0.17806906
## [2,] 0.29477457 0.05452353
## [3,] 0.29477381 0.05452298
## [4,] -0.03783349 -0.17562763
## [5,] -0.03781525 -0.17563521
## [6,] -0.03783497 -0.17562357
x3 = pcv(kpc)
pairs(datasetnew[,])
plot(x3[,1], x3[,2],
xlab = "PC1",
ylab = "PC2",
main = "Kernel PCA Projection",
pch = 19, col = 'blue')
# From our initial PCA analysis using prcomp we got a good breakdown of variance by principal component on the scree plot
# however, the biplot was a bit too cluttered with the number of data points to really see clear differences
# moving on to LLE, I looked at 10%, 20%, and 50% neighborhoods
# interestingly, 20% looks far more like 50% than it does 10%, showing how the embedding changes depending on how strict the parameters are
# in our kernel PCA we see the non-linear separation of our data
# In this case I think the LLE plots best showed the distinction and separation once the dimensions were reduced
# prcomp didn't show enough distinction with this many data points, and kernel PCA, while useful, wasn't as specific in this case
#Question 4
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
data3 = dataset[,2:3]
head(data3)
## indus nox
## 1 2.31 0.538
## 2 7.07 0.469
## 3 7.07 0.469
## 4 2.18 0.458
## 5 2.18 0.458
## 6 2.18 0.458
#now the dataset is just indus and nox
t.test(data3$indus,mu=11)
##
## One Sample t-test
##
## data: data3$indus
## t = 0.44848, df = 505, p-value = 0.654
## alternative hypothesis: true mean is not equal to 11
## 95 percent confidence interval:
## 10.53759 11.73596
## sample estimates:
## mean of x
## 11.13678
# test whether the indus mean = 11: the p-value here is 0.654, far above common significance
# levels (e.g. 0.05 or 0.1), so H0 is not rejected; the data are consistent with a mean of 11
# KS test
head(data3)
## indus nox
## 1 2.31 0.538
## 2 7.07 0.469
## 3 7.07 0.469
## 4 2.18 0.458
## 5 2.18 0.458
## 6 2.18 0.458
newx2 = (data3$indus -mean(data3$indus))/sd(data3$indus)
head(newx2)
## [1] -1.2866362 -0.5927944 -0.5927944 -1.3055857 -1.3055857 -1.3055857
summary(newx2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5563 -0.8668 -0.2109 0.0000 1.0150 2.4202
newx3 = (data3$nox -mean(data3$nox))/sd(data3$nox)
head(newx3)
## [1] -0.1440749 -0.7395304 -0.7395304 -0.8344581 -0.8344581 -0.8344581
summary(newx3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4644 -0.9121 -0.1441 0.0000 0.5981 2.7296
hist(newx2)
hist(data3$indus)
ks.test(newx2,"pnorm")
## Warning in ks.test.default(newx2, "pnorm"): ties should not be present for the
## one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: newx2
## D = 0.21846, p-value < 2.2e-16
## alternative hypothesis: two-sided
# the p-value here is < 2.2e-16, so reject H0: the distribution is NOT normal
hist(newx3)
hist(data3$nox)
ks.test(newx3,"pnorm")
## Warning in ks.test.default(newx3, "pnorm"): ties should not be present for the
## one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: newx3
## D = 0.10552, p-value = 2.552e-05
## alternative hypothesis: two-sided
# the p-value is 2.55e-05, so reject H0: the distribution is not normal
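# a normal Q-Q plot is a useful visual companion to these formal tests:
qqnorm(newx2, main = "Normal Q-Q plot: standardized indus"); qqline(newx2)
qqnorm(newx3, main = "Normal Q-Q plot: standardized nox"); qqline(newx3)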
# AD (Anderson-Darling) test
install.packages("nortest")
library(nortest)
ad.test(newx2)
##
## Anderson-Darling normality test
##
## data: newx2
## A = 22.134, p-value < 2.2e-16
# p-value is < 2.2e-16; reject H0, not normal
ad.test(newx3)
##
## Anderson-Darling normality test
##
## data: newx3
## A = 8.3384, p-value < 2.2e-16
# p-value is < 2.2e-16; reject H0, not normal
# KSD (kernelized Stein discrepancy) test
install.packages('KSD')
library(KSD)
## Warning: package 'KSD' was built under R version 4.4.3
X=newx2
scor=-X ## the score-function of standard normal is equal to -X.
KSD.test=KSD(X, score_function=scor, kernel = "rbf", width = -1, nboot = 2000)
#KSD.test
KSD.test$p ## get the p-value
## [1] 0
# p-value = 0; reject H0, indus is not normal
X=newx3
scor=-X ## the score-function of standard normal is equal to -X.
KSD.test=KSD(X, score_function=scor, kernel = "rbf", width = -1, nboot = 2000)
KSD.test$p ## get the p-value
## [1] 0
# p-value = 0; reject H0, nox is not normal either