# Setup: the packages caTools, MASS, neuralnet, ggplot2, and Rdimtools (v1.1.2)
# were installed and loaded for this assignment; the verbose installation output
# and the Rdimtools startup banner are omitted here.
#HW 3 Rachel Konshok
#07/26/2025
#Question 1:
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# To create the data set suggested in the homework, we extract only crim, indus, nox, dis, and rm as the predictors (x) and medv as the response (y)
dataset = Boston[,c(1,3,5,8,6,14)]
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
# the summary statistics of the data set are shown below
summary(dataset)
## crim indus nox dis
## Min. : 0.00632 Min. : 0.46 Min. :0.3850 Min. : 1.130
## 1st Qu.: 0.08205 1st Qu.: 5.19 1st Qu.:0.4490 1st Qu.: 2.100
## Median : 0.25651 Median : 9.69 Median :0.5380 Median : 3.207
## Mean : 3.61352 Mean :11.14 Mean :0.5547 Mean : 3.795
## 3rd Qu.: 3.67708 3rd Qu.:18.10 3rd Qu.:0.6240 3rd Qu.: 5.188
## Max. :88.97620 Max. :27.74 Max. :0.8710 Max. :12.127
## rm medv
## Min. :3.561 Min. : 5.00
## 1st Qu.:5.886 1st Qu.:17.02
## Median :6.208 Median :21.20
## Mean :6.285 Mean :22.53
## 3rd Qu.:6.623 3rd Qu.:25.00
## Max. :8.780 Max. :50.00
# (a): fit a neural network with 3 hidden layers and 4 nodes in each layer
# first we're going to scale the data set and split it into training and test sets
max = apply(dataset, 2, max)
min = apply(dataset, 2, min)
scaled_dataset = as.data.frame(scale(dataset, center = min, scale = max - min))
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
head(scaled_dataset)
## crim indus nox dis rm medv
## 1 0.0000000000 0.06781525 0.3148148 0.2692031 0.5775053 0.4222222
## 2 0.0002359225 0.24230205 0.1728395 0.3489620 0.5479977 0.3688889
## 3 0.0002356977 0.24230205 0.1728395 0.3489620 0.6943859 0.6600000
## 4 0.0002927957 0.06304985 0.1502058 0.4485446 0.6585553 0.6311111
## 5 0.0007050701 0.06304985 0.1502058 0.4485446 0.6871048 0.6933333
## 6 0.0002644715 0.06304985 0.1502058 0.4485446 0.5497222 0.5266667
set.seed(123)
split = sample.split(scaled_dataset$medv, SplitRatio = .6)
training_set = subset(scaled_dataset, split == TRUE)
test_set = subset(scaled_dataset, split == FALSE)
head(training_set)
## crim indus nox dis rm medv
## 1 0.0000000000 0.06781525 0.3148148 0.2692031 0.5775053 0.4222222
## 2 0.0002359225 0.24230205 0.1728395 0.3489620 0.5479977 0.3688889
## 3 0.0002356977 0.24230205 0.1728395 0.3489620 0.6943859 0.6600000
## 4 0.0002927957 0.06304985 0.1502058 0.4485446 0.6585553 0.6311111
## 6 0.0002644715 0.06304985 0.1502058 0.4485446 0.5497222 0.5266667
## 9 0.0023032514 0.27162757 0.2860082 0.4503542 0.3966277 0.2555556
head(test_set)
## crim indus nox dis rm medv
## 5 0.0007050701 0.06304985 0.1502058 0.4485446 0.6871048 0.6933333
## 7 0.0009213230 0.27162757 0.2860082 0.4029226 0.4696302 0.3977778
## 8 0.0015536719 0.27162757 0.2860082 0.4383872 0.5002874 0.4911111
## 10 0.0018401733 0.27162757 0.2860082 0.4967309 0.4680973 0.3088889
## 15 0.0070994813 0.28152493 0.3148148 0.3030218 0.4857252 0.2933333
## 16 0.0069806771 0.28152493 0.3148148 0.3063591 0.4355240 0.3311111
#Fit the neural network with 3 layers and 4 nodes in each layer
set.seed(3)
Neural_Net = neuralnet(formula = medv ~., data = training_set, hidden = c(4,4,4), linear.output = TRUE)
plot(Neural_Net)
# the plot visualizes the neural network with 3 hidden layers and 4 nodes in each layer
# To predict the response I first need to find the row of the data where x1 = 0.14, x2 = 7.87, and so on
Boston[Boston$indus == 7.87, ]
## crim zn indus chas nox rm age dis rad tax ptratio black
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## 11 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311 15.2 392.52
## 12 0.11747 12.5 7.87 0 0.524 6.009 82.9 6.2267 5 311 15.2 396.90
## 13 0.09378 12.5 7.87 0 0.524 5.889 39.0 5.4509 5 311 15.2 390.50
## lstat medv
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 10 17.10 18.9
## 11 20.45 15.0
## 12 13.27 18.9
## 13 15.71 21.7
# The row that matches the required values is row 8
test_prediction = predict(Neural_Net, test_set[3,])
# above we use test_set row 3 (not column 3) because it corresponds to row 8 of the full
# data set; printing test_set[3,] confirms that its row name is 8
test_prediction
## [,1]
## 8 0.3168627
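# As an alternative to searching for a matching row, a new observation can be scaled
# directly with the min/max vectors computed earlier; a minimal sketch (new_x is a
# hypothetical input built from the row-8 values printed above):
new_x = data.frame(crim = 0.14455, indus = 7.87, nox = 0.524, dis = 5.9505, rm = 6.172)
new_x_scaled = as.data.frame(scale(new_x, center = min[1:5], scale = (max - min)[1:5]))
predict(Neural_Net, new_x_scaled)  # should match test_prediction above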
# the response is 0.3168627 using this model
# the true value from our data set (scaled_dataset[8,]) is 0.4911, the scaled value for 27.1
value1 = (.3168627-.49111)^2
value1
## [1] 0.03036212
# Thus the squared prediction error on the scaled data is about 0.0304
sqrt(value1)
## [1] 0.1742473
# The square root is about 0.174 in scaled units; since medv spans 5 to 50 (in $1000s),
# this corresponds to roughly 0.174 * 45 = 7.8, i.e. an error of about $7,800
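# helper (not part of the original assignment code): map a scaled medv back to the
# original units ($1000s) using the min/max vectors computed earlier
unscale_medv = function(y) y * (max["medv"] - min["medv"]) + min["medv"]
unscale_medv(0.4911111)  # 27.1, the true medv for row 8
unscale_medv(0.3168627)  # about 19.26, the model's prediction in original units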
# now to repeat for (b) with 2 hidden layers and 3 nodes in each layer
set.seed(3)
Neural_Net2 = neuralnet(formula = medv ~., data = training_set, hidden = c(3,3), linear.output = TRUE)
plot(Neural_Net2)
test_prediction2 = predict(Neural_Net2, test_set[3,])
test_prediction2
## [,1]
## 8 0.3042053
#the response is .3042053 using this model
# the true value from our data set (scaled_dataset[8,]) is 0.4911, the scaled value for 27.1
value2 = (.3042053-.49111)^2
value2
## [1] 0.03493337
#Thus our predicted error from the scaled numbers is .03493337
sqrt(value2)
## [1] 0.1869047
# the square root is 0.1869 in scaled units, i.e. roughly 0.187 * 45 = 8.4, an error of about $8,400
# therefore the second model does not perform as well as the first
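# A fuller comparison would score both models on the entire test set rather than on a
# single row; a sketch, where rmse_dollars is a helper defined here (not part of the
# assignment) that reports RMSE in the original medv units ($1000s):
rmse_dollars = function(model, data) {
  pred = predict(model, data)
  sqrt(mean((pred - data$medv)^2)) * (max["medv"] - min["medv"])
}
rmse_dollars(Neural_Net, test_set)   # model (a): 3 layers of 4 nodes
rmse_dollars(Neural_Net2, test_set)  # model (b): 2 layers of 3 nodes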
# moving on to Question 2: nonparametric regression on the Boston data set in MASS
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
#first we're going to extract just age and medv for these problems
data2 = Boston[,c(7,14)]
head(data2)
## age medv
## 1 65.2 24.0
## 2 78.9 21.6
## 3 61.1 34.7
## 4 45.8 33.4
## 5 54.2 36.2
## 6 58.7 28.7
localaverage=with(data2, supsmu(age, medv))
head(localaverage)
## $x
## [1] 2.9 6.0 6.2 6.5 6.6 6.8 7.8 8.4 8.9 9.8 9.9 10.0
## [13] 13.0 13.9 14.7 15.3 15.7 15.8 16.3 17.0 17.2 17.5 17.7 17.8
## [25] 18.4 18.5 18.8 19.1 19.5 20.1 20.8 21.1 21.4 21.5 21.8 21.9
## [37] 22.3 22.9 23.3 23.4 24.8 25.8 26.3 27.6 27.7 27.9 28.1 28.4
## [49] 28.8 28.9 29.1 29.2 29.3 29.7 30.2 30.8 31.1 31.3 31.5 31.9
## [61] 32.0 32.1 32.2 32.3 32.9 33.0 33.1 33.2 33.3 33.5 33.8 34.1
## [73] 34.2 34.5 34.9 35.7 35.9 36.1 36.6 36.8 36.9 37.2 37.3 37.8
## [85] 38.1 38.3 38.4 38.5 38.9 39.0 40.0 40.1 40.3 40.4 40.5 41.1
## [97] 41.5 41.9 42.1 42.2 42.3 42.4 42.6 42.8 43.4 43.7 44.4 45.0
## [109] 45.1 45.4 45.6 45.7 45.8 46.3 46.7 47.2 47.4 47.6 48.0 48.2
## [121] 48.5 49.0 49.1 49.3 49.7 49.9 51.0 51.8 51.9 52.3 52.5 52.6
## [133] 52.8 52.9 53.2 53.6 53.7 53.8 54.0 54.2 54.3 54.4 56.0 56.1
## [145] 56.4 56.5 56.7 56.8 57.8 58.0 58.1 58.4 58.5 58.7 58.8 59.1
## [157] 59.5 59.6 59.7 61.1 61.4 61.5 61.8 62.0 62.2 62.5 62.8 63.0
## [169] 63.1 64.5 64.7 65.1 65.2 65.3 65.4 66.1 66.2 66.5 66.6 67.0
## [181] 67.2 67.6 67.8 68.1 68.2 68.7 68.8 69.1 69.5 69.6 69.7 70.2
## [193] 70.3 70.4 70.6 71.0 71.3 71.6 71.7 71.9 72.5 72.7 72.9 73.1
## [205] 73.3 73.4 73.5 73.9 74.3 74.4 74.5 74.8 74.9 75.0 76.0 76.5
## [217] 76.7 76.9 77.0 77.3 77.7 77.8 78.1 78.3 78.7 78.9 79.2 79.7
## [229] 79.8 79.9 80.3 80.8 81.3 81.6 81.7 81.8 82.0 82.5 82.6 82.8
## [241] 82.9 83.0 83.2 83.3 83.4 83.5 83.7 84.0 84.1 84.2 84.4 84.5
## [253] 84.6 84.7 85.1 85.2 85.4 85.5 85.7 85.9 86.1 86.3 86.5 86.9
## [265] 87.3 87.4 87.6 87.9 88.0 88.2 88.4 88.5 88.6 88.8 89.0 89.1
## [277] 89.2 89.3 89.4 89.5 89.6 89.8 89.9 90.0 90.3 90.4 90.7 90.8
## [289] 91.0 91.1 91.2 91.3 91.4 91.5 91.6 91.7 91.8 91.9 92.1 92.2
## [301] 92.4 92.6 92.7 92.9 93.0 93.3 93.4 93.5 93.6 93.8 93.9 94.0
## [313] 94.1 94.3 94.4 94.5 94.6 94.7 94.8 94.9 95.0 95.2 95.3 95.4
## [325] 95.6 95.7 95.8 96.0 96.1 96.2 96.4 96.6 96.7 96.8 96.9 97.0
## [337] 97.1 97.2 97.3 97.4 97.5 97.7 97.8 97.9 98.0 98.1 98.2 98.3
## [349] 98.4 98.5 98.7 98.8 98.9 99.1 99.3 100.0
##
## $y
## [1] 26.63125 26.88226 26.89845 26.92274 26.93084 26.94703 27.02800 27.07658
## [9] 27.11707 27.18994 27.19803 27.20613 27.46354 27.51415 27.55332 27.57800
## [17] 27.58941 27.58571 27.58321 27.57407 27.57108 27.56931 27.57192 27.57646
## [25] 27.55442 27.53671 27.51811 27.50311 27.49319 27.49054 27.49078 27.48877
## [33] 27.49055 27.49565 27.48971 27.48267 27.46942 27.46034 27.45879 27.46356
## [41] 27.44522 27.42466 27.41644 27.38360 27.36432 27.34221 27.32425 27.30033
## [49] 27.26692 27.24974 27.22982 27.22574 27.22270 27.17352 27.09896 26.99056
## [57] 26.93007 26.89506 26.85955 26.78816 26.78580 26.78099 26.77434 26.74628
## [65] 26.66064 26.64312 26.62706 26.61120 26.59589 26.57744 26.55465 26.53510
## [73] 26.52258 26.50260 26.47857 26.43469 26.41964 26.40453 26.37373 26.36316
## [81] 26.35977 26.34078 26.33379 26.30392 26.28609 26.27452 26.26830 26.26521
## [89] 26.24627 26.24508 26.18526 26.18233 26.16679 26.15796 26.14113 26.09446
## [97] 26.06273 26.02877 26.01397 26.00395 25.99869 25.99554 25.98104 25.96413
## [105] 25.90141 25.87077 25.78894 25.71437 25.70282 25.66892 25.64960 25.64126
## [113] 25.63674 25.58297 25.53973 25.48282 25.45777 25.43055 25.38966 25.36786
## [121] 25.33436 25.28702 25.27975 25.26151 25.23055 25.21486 25.13919 25.09030
## [129] 25.08903 25.06979 25.06192 25.05666 25.04639 25.03813 25.01668 24.99105
## [137] 24.97605 24.96495 24.95012 24.93544 24.93259 24.93500 24.78993 24.78321
## [145] 24.75900 24.75333 24.73628 24.72689 24.61493 24.59481 24.58370 24.54890
## [153] 24.53826 24.51625 24.50732 24.47178 24.42457 24.41241 24.39962 24.25317
## [161] 24.21849 24.19929 24.15664 24.13317 24.11417 24.08166 24.05596 24.03894
## [169] 24.02984 23.87320 23.84971 23.80371 23.78967 23.77594 23.76653 23.68814
## [177] 23.67951 23.64760 23.63538 23.59049 23.56739 23.52244 23.50596 23.47808
## [185] 23.46975 23.42854 23.41928 23.39645 23.36646 23.36086 23.35430 23.31690
## [193] 23.30828 23.29238 23.26859 23.23698 23.21414 23.19086 23.18438 23.16497
## [201] 23.10565 23.08232 23.05823 23.03640 23.01506 23.00664 23.00029 22.94831
## [209] 22.89555 22.88169 22.86951 22.82997 22.81913 22.80781 22.66129 22.59045
## [217] 22.56458 22.53466 22.51764 22.47104 22.40497 22.38442 22.33916 22.30864
## [225] 22.23925 22.20162 22.14367 22.04416 22.02480 22.00315 21.91804 21.80616
## [233] 21.68666 21.61220 21.58891 21.56764 21.51505 21.39195 21.38386 21.34239
## [241] 21.32216 21.30659 21.26550 21.24366 21.22339 21.20423 21.17398 21.13241
## [249] 21.11367 21.09437 21.06236 21.05097 21.03906 21.03278 20.99261 20.98489
## [257] 20.96322 20.95405 20.92534 20.89888 20.86933 20.84166 20.81603 20.75834
## [265] 20.69846 20.67745 20.64398 20.58336 20.56128 20.51601 20.45909 20.42324
## [273] 20.38696 20.30950 20.22888 20.18437 20.14003 20.09446 20.05158 20.00848
## [281] 19.96569 19.86393 19.81216 19.76195 19.60249 19.55123 19.38330 19.33088
## [289] 19.22325 19.17471 19.12716 19.07794 19.02726 18.97932 18.93545 18.89577
## [297] 18.85725 18.81986 18.76400 18.74069 18.70232 18.67526 18.66923 18.67081
## [305] 18.67747 18.70686 18.72749 18.75297 18.76855 18.80942 18.82409 18.83546
## [313] 18.83526 18.84544 18.84897 18.84968 18.85332 18.83947 18.81297 18.77173
## [321] 18.72456 18.61106 18.54297 18.45703 18.27638 18.17443 18.06366 17.84104
## [329] 17.75799 17.65985 17.47923 17.31918 17.26406 17.23275 17.19137 17.15916
## [337] 17.13874 17.13450 17.12313 17.13896 17.18844 17.21216 17.17070 17.03312
## [345] 16.80016 16.62456 16.50384 16.34404 16.14298 15.99432 15.73886 15.61511
## [353] 15.33680 15.10889 15.12312 15.62703
plot(data2$age,data2$medv,main="local averaging fitted curve")
lines(localaverage$x,localaverage$y,col="blue")
# we can compare the graph above, which uses the local-averaging curve, to a classic linear model
lm1 <- lm(medv ~ age, data = data2)
lm1
##
## Call:
## lm(formula = medv ~ age, data = data2)
##
## Coefficients:
## (Intercept) age
## 30.9787 -0.1232
summary(lm1)$r.squared
## [1] 0.1420947
# the R^2 value for the lm model is only 0.142, and its residuals-vs-fitted diagnostic (drawn below) suggests a worse fit than the local-averaging curve gave
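# the diagnostic referenced above can be reproduced with R's built-in lm plot method:
plot(lm1, which = 1)  # residuals vs. fitted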
coefs <- coef(lm1)
xseq <- seq(min(Boston$age), max(Boston$age), length.out = 100)
yhat <- coefs[1] + coefs[2] * xseq
plot(Boston$age, Boston$medv,
xlab = "Age",
ylab = "Medv",
main = "Linear Model",
pch = 19, col = "blue")
abline(lm1, col = "red", lwd = 3, lty = 8)
# Now to move on to the Gaussian kernel (Nadaraya-Watson) method
# visualizing this method at bandwidths 10, 1, and 5, since it's hard to see differences with so many data points
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 10)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='red', lwd = 3, lty = 8)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 1)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='green', lwd = 3, lty = 8)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 5)
plot(data2$age,data2$medv,main="N-W Kernel regression")
lines(kerreg$x,kerreg$y,col='blue', lwd = 3, lty = 8)
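# optionally, the three bandwidths can be overlaid on a single plot for a direct comparison:
plot(data2$age, data2$medv, main = "N-W kernel regression, bandwidths 1, 5, 10")
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 1), col = "green", lwd = 2)
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 5), col = "blue", lwd = 2)
lines(ksmooth(data2$age, data2$medv, "normal", bandwidth = 10), col = "red", lwd = 2)
legend("topright", legend = c("h = 1", "h = 5", "h = 10"),
       col = c("green", "blue", "red"), lwd = 2)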
#now for local polynomial regression
xx2=sort(data2$age, index.return = TRUE)
x2=xx2$x
y2=data2$medv[xx2$ix]
locLinear <-loess(y2~x2,degree=1) # local linear
locQuad <-loess(y2~x2,degree=2) # local quadratic
head(locLinear$fitted)
## [1] 28.70314 28.48887 28.47518 28.45466 28.44783 28.44783
head(locQuad$fitted)
## [1] 27.87377 27.83392 27.83079 27.82596 27.82432 27.82432
plot(x2,y2,main="Local linear regression")
lines(x2,locLinear$fitted,lwd = 3, lty = 8, col = "purple")
plot(x2,y2,main="Local Quadratic regression")
lines(x2,locQuad$fitted,lwd = 3, lty = 8, col = "yellow")
#########################
## predict the house price of a house with age x = 20 and 50
#########################
## predict the house price of a house with age x0
locLinear <-loess(medv ~ age,degree=1,span = 0.75, data = data2)
locQuad <-loess(medv~age,degree=2,span=0.75, data = data2)
kerreg <-ksmooth(data2$age,data2$medv,"normal", bandwidth = 15)
localaverage=supsmu(data2$age,data2$medv,bass=10)
houseAge=c(20, 50)
Linear.predicted.Price=predict(locLinear,houseAge)
Linear.predicted.Price
## [1] 27.54145 25.33634
Quadratic.predicted.Price=predict(locQuad,houseAge)
Quadratic.predicted.Price
## [1] 27.44647 25.48815
NW.predicted.Price=NULL
for (re in 1:length(houseAge)){
Index=which.min(abs(kerreg$x-houseAge[re]))
NW.predicted.Price[re]=kerreg$y[Index]
}
NW.predicted.Price
## [1] 27.55588 25.03678
LA.predicted.Price=NULL
for (re in 1:length(houseAge)){
Index=which.min(abs(localaverage$x-houseAge[re]))
LA.predicted.Price[re]=localaverage$y[Index]
}
LA.predicted.Price
## [1] 27.48646 25.31111
# houseAge is kept as the variable name, but houseAge = c(20, 50) corresponds to x = 20 and x = 50
# in this case x = age and y = medv
houseAge
## [1] 20 50
Linear.predicted.Price
## [1] 27.54145 25.33634
Quadratic.predicted.Price
## [1] 27.44647 25.48815
NW.predicted.Price
## [1] 27.55588 25.03678
LA.predicted.Price
## [1] 27.48646 25.31111
target <- 20
closest_row <- data2[which.min(abs(data2$age - target)), ]
closest_row
## age medv
## 299 20.1 22.5
target2 = 50
closest_row2 <- data2[which.min(abs(data2$age - target2)), ]
closest_row2
## age medv
## 323 49.9 20.4
# The closest observations from the search above: at x = 20 (nearest age 20.1), medv = 22.5;
# at x = 50 (nearest age 49.9), medv = 20.4
# With that in mind: local linear predicts y = 27.54145 at x = 20 and y = 25.33634 at x = 50
# local quadratic: y = 27.44647 and y = 25.48815
# N-W kernel: y = 27.55588 and y = 25.03678
# local averaging: y = 27.48646 and y = 25.31111
# Interestingly, the four methods agree closely with each other but not with the single nearest
# observations. This is expected: the smoothers estimate the conditional mean of medv at a given
# age, averaged over many nearby houses, while the lookup above returns one individual house,
# which can deviate substantially from that mean (see the quick check below)
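# a quick check of that explanation (the +/-5-year window width is an arbitrary choice
# made here): averaging medv over a small age window should land much nearer the
# smoothed values than any single house does
mean(data2$medv[abs(data2$age - 20) <= 5])
mean(data2$medv[abs(data2$age - 50) <= 5])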
#Question 3 below
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
datasetnew = dataset[,1:5]
head(datasetnew)
## crim indus nox dis rm
## 1 0.00632 2.31 0.538 4.0900 6.575
## 2 0.02731 7.07 0.469 4.9671 6.421
## 3 0.02729 7.07 0.469 4.9671 7.185
## 4 0.03237 2.18 0.458 6.0622 6.998
## 5 0.06905 2.18 0.458 6.0622 7.147
## 6 0.02985 2.18 0.458 6.0622 6.430
# datasetnew is now Boston restricted to five variables: x1 = crim, x2 = indus, x3 = nox, x4 = dis, and x5 = rm, for dimension reduction
# Starting with principal component analysis
resultspca = prcomp(datasetnew, scale = TRUE)
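# flip the sign of the loadings; PCA loadings are only identified up to sign,
# so this is purely cosmetic and makes the components easier to read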
resultspca$rotation = -1*resultspca$rotation
resultspca$rotation
## PC1 PC2 PC3 PC4 PC5
## crim -0.3536280 -0.009216982 0.9348817 -0.01274628 -0.026381521
## indus -0.5211711 0.017127860 -0.1973602 0.74195861 -0.372335441
## nox -0.5250662 0.180439311 -0.1740364 -0.01118060 0.813224145
## dis 0.4988737 -0.321526658 0.2068162 0.63690335 0.446460277
## rm 0.2806392 0.929345978 0.1182520 0.20872009 0.003169356
resultspca$sdev^2 /sum(resultspca$sdev^2)
## [1] 0.58677604 0.17305491 0.14481525 0.05270987 0.04264392
#58% can be explained by principal component 1, 17% by 2, 14% by 3, 5% by 4 and 4% by 5 (roughly)
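# the cumulative proportion is often easier to read: the first two components
# together explain about 76% of the variance
cumsum(resultspca$sdev^2) / sum(resultspca$sdev^2)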
resultspca
## Standard deviations (1, .., p=5):
## [1] 1.7128573 0.9302014 0.8509267 0.5133706 0.4617571
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## crim -0.3536280 -0.009216982 0.9348817 -0.01274628 -0.026381521
## indus -0.5211711 0.017127860 -0.1973602 0.74195861 -0.372335441
## nox -0.5250662 0.180439311 -0.1740364 -0.01118060 0.813224145
## dis 0.4988737 -0.321526658 0.2068162 0.63690335 0.446460277
## rm 0.2806392 0.929345978 0.1182520 0.20872009 0.003169356
biplot(resultspca, scale = 0)
# above is the 'basic' PCA biplot for this data set
# scree plot
var_explained = resultspca$sdev^2 / sum(resultspca$sdev^2)
qplot(x = 1:5, y = var_explained, geom = c("point", "line")) +
coord_cartesian(ylim = c(0, 1)) +
labs(
x = "Principal Component",
y = "Variance Explained",
title = "Scree Plot"
)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# locally linear embedding (LLE)
mass = as.matrix(datasetnew)
s.mass = scale(mass, center = TRUE, scale = TRUE)
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .1))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,10% neighbor")
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .2))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,20% neighbor")
LLE = do.lle(s.mass, ndim = 2, type=list(type = "proportion", proportion = .5))
pc1=LLE$Y[,1]
pc2 = LLE$Y[,2]
plot(pc1,pc2,main="LLE component 1 and component 2,50% neighbor")
head(LLE$Y)
## [,1] [,2]
## [1,] 0.02449657 0.012017386
## [2,] 0.02987192 -0.007409192
## [3,] 0.03499297 0.003688514
## [4,] 0.04831477 -0.015217832
## [5,] 0.04933405 -0.013011205
## [6,] 0.04440351 -0.023724474
head(resultspca$x)
## PC1 PC2 PC3 PC4 PC5
## 1 -1.080364 -0.2948576 0.03521451 0.77220411 -0.4368063
## 2 -1.176831 0.1383458 0.00601154 0.03123117 0.1205675
## 3 -1.481989 -0.8721908 -0.12256921 -0.19572340 0.1171212
## 4 -2.087864 -0.4282909 -0.35640356 0.05640554 -0.3024068
## 5 -2.145869 -0.6253327 -0.38546726 0.01219782 -0.3029664
## 6 -1.861097 0.3229954 -0.26053399 0.22513241 -0.2998524
# Kernel PCA
install.packages("kernlab")
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
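# run kernel PCA with an RBF kernel; note that datasetnew[,-5] drops rm, so only
# crim, indus, nox, and dis enter here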
kpc = kpca(~.,data=datasetnew[,-5], kernel="rbfdot",
kpar=list(sigma=.2),features=2)
head(pcv(kpc))
## [,1] [,2]
## [1,] -0.03310871 -0.17806906
## [2,] 0.29477457 0.05452353
## [3,] 0.29477381 0.05452298
## [4,] -0.03783349 -0.17562763
## [5,] -0.03781525 -0.17563521
## [6,] -0.03783497 -0.17562357
x3 = pcv(kpc)
pairs(datasetnew[,])
plot(x3[,1], x3[,2],
xlab = "PC1",
ylab = "PC2",
main = "Kernel PCA Projection",
pch = 19, col = 'blue')
# From our initial PCA analysis using prcomp we got a good breakdown of variance by principal component on the scree plot
# however, the biplot was a bit too cluttered with the number of data points to really see clear differences
# moving on to LLE, I looked at 10%, 20%, and 50% neighborhoods
# interestingly, 20% looks far more like 50% than it does 10%, showing how the embedding changes depending on how strict the parameters are
# in our kernel PCA we see the non-linear separation of our data
# In this case I think the LLE plots best showed the distinction and separation once the dimensions were reduced
# prcomp didn't show enough distinction with this many data points, and kernel PCA, while useful, wasn't as specific in this case
#Question 4
head(dataset)
## crim indus nox dis rm medv
## 1 0.00632 2.31 0.538 4.0900 6.575 24.0
## 2 0.02731 7.07 0.469 4.9671 6.421 21.6
## 3 0.02729 7.07 0.469 4.9671 7.185 34.7
## 4 0.03237 2.18 0.458 6.0622 6.998 33.4
## 5 0.06905 2.18 0.458 6.0622 7.147 36.2
## 6 0.02985 2.18 0.458 6.0622 6.430 28.7
data3 = dataset[,2:3]
head(data3)
## indus nox
## 1 2.31 0.538
## 2 7.07 0.469
## 3 7.07 0.469
## 4 2.18 0.458
## 5 2.18 0.458
## 6 2.18 0.458
#now the dataset is just indus and nox
t.test(data3$indus,mu=11)
##
## One Sample t-test
##
## data: data3$indus
## t = 0.44848, df = 505, p-value = 0.654
## alternative hypothesis: true mean is not equal to 11
## 95 percent confidence interval:
## 10.53759 11.73596
## sample estimates:
## mean of x
## 11.13678
# test whether the indus mean = 11: the p-value here is 0.654, far above common significance
# levels (e.g. 0.05 or 0.1), so H0 is not rejected; the data are consistent with a mean of 11
# KS test
head(data3)
## indus nox
## 1 2.31 0.538
## 2 7.07 0.469
## 3 7.07 0.469
## 4 2.18 0.458
## 5 2.18 0.458
## 6 2.18 0.458
newx2 = (data3$indus -mean(data3$indus))/sd(data3$indus)
head(newx2)
## [1] -1.2866362 -0.5927944 -0.5927944 -1.3055857 -1.3055857 -1.3055857
summary(newx2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5563 -0.8668 -0.2109 0.0000 1.0150 2.4202
newx3 = (data3$nox -mean(data3$nox))/sd(data3$nox)
head(newx3)
## [1] -0.1440749 -0.7395304 -0.7395304 -0.8344581 -0.8344581 -0.8344581
summary(newx3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4644 -0.9121 -0.1441 0.0000 0.5981 2.7296
hist(newx2)
hist(data3$indus)
ks.test(newx2,"pnorm")
## Warning in ks.test.default(newx2, "pnorm"): ties should not be present for the
## one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: newx2
## D = 0.21846, p-value < 2.2e-16
## alternative hypothesis: two-sided
# the p-value here is < 2.2e-16, so reject H0: the distribution is NOT normal
hist(newx3)
hist(data3$nox)
ks.test(newx3,"pnorm")
## Warning in ks.test.default(newx3, "pnorm"): ties should not be present for the
## one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: newx3
## D = 0.10552, p-value = 2.552e-05
## alternative hypothesis: two-sided
# the p-value is 2.55e-05, so reject H0: the distribution is not normal
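# a normal Q-Q plot is a useful visual companion to these formal tests:
qqnorm(newx2, main = "Normal Q-Q plot: standardized indus"); qqline(newx2)
qqnorm(newx3, main = "Normal Q-Q plot: standardized nox"); qqline(newx3)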
# AD (Anderson-Darling) test
install.packages("nortest")
library(nortest)
ad.test(newx2)
##
## Anderson-Darling normality test
##
## data: newx2
## A = 22.134, p-value < 2.2e-16
# p-value is < 2.2e-16; reject H0, not normal
ad.test(newx3)
##
## Anderson-Darling normality test
##
## data: newx3
## A = 8.3384, p-value < 2.2e-16
# p-value is < 2.2e-16; reject H0, not normal
# KSD (kernelized Stein discrepancy) test
install.packages('KSD')
library(KSD)
## Warning: package 'KSD' was built under R version 4.4.3
X=newx2
scor=-X ## the score-function of standard normal is equal to -X.
KSD.test=KSD(X, score_function=scor, kernel = "rbf", width = -1, nboot = 2000)
#KSD.test
KSD.test$p ## get the p-value
## [1] 0
# p-value = 0; reject H0, indus is not normal
X=newx3
scor=-X ## the score-function of standard normal is equal to -X.
KSD.test=KSD(X, score_function=scor, kernel = "rbf", width = -1, nboot = 2000)
KSD.test$p ## get the p-value
## [1] 0
# p-value = 0; reject H0, nox is not normal either