Question 1

Read the flight data into R. Use the B-spline algorithm to develop a spline model with 5 degrees of freedom, with the column “speed” as the covariate and the column “distance” as the target variable. Perform the following for your spline model: a) Calculate 70% confidence intervals for all data points. b) Create a scatter plot comparing plane speed and landing distance, and on this plot, display the confidence intervals and predicted values for all data points.

set.seed(1234)
# read flight data set in
library(readxl)
flightdata<-read_excel("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Test 1/flight_data.xlsx")

# what does the data look like?
str(flightdata)
## tibble [400 × 2] (S3: tbl_df/tbl/data.frame)
##  $ speed   : num [1:400] 27.7 29.2 33.8 34.1 34.2 ...
##  $ distance: num [1:400] 1324 1077 937 1198 956 ...
#What does the data look like plotted? 
#scatter plot
plot(flightdata$distance ~ flightdata$speed, xlab = "Speed", ylab = "Distance", main = "Scatterplot of Distance against Speed")

#plot is not linear: distance dips down and then curves upward as speed increases
#load spline library to run spline model
library(splines)

#Change variables to X and Y to make them easier to work with
Y <- flightdata$distance
X <- flightdata$speed

#spline model with 5 degrees of freedom
flightdata_sp <- lm(Y ~ bs(X, df = 5), data = flightdata)
summary(flightdata_sp)
## 
## Call:
## lm(formula = Y ~ bs(X, df = 5), data = flightdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -541.47 -122.79   -5.74  131.70  544.96 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1211.2      111.7  10.842  < 2e-16 ***
## bs(X, df = 5)1   -365.2      177.9  -2.052   0.0408 *  
## bs(X, df = 5)2   -494.2      104.6  -4.722 3.25e-06 ***
## bs(X, df = 5)3   1050.5      140.3   7.486 4.70e-13 ***
## bs(X, df = 5)4   3203.4      139.1  23.024  < 2e-16 ***
## bs(X, df = 5)5   5442.5      179.1  30.388  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194.2 on 394 degrees of freedom
## Multiple R-squared:  0.9637, Adjusted R-squared:  0.9632 
## F-statistic:  2090 on 5 and 394 DF,  p-value: < 2.2e-16
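# As a check on what bs() did: with df = 5 and the default cubic degree,
# bs() places df - degree = 2 interior knots at quantiles of X. An optional
# way to inspect them (not required by the question):
attr(bs(X, df = 5), "knots")
attr(bs(X, df = 5), "Boundary.knots")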
#a) Calculate 70% confidence intervals for all data points
flightdata_conf_inter <- predict(flightdata_sp, interval = "confidence", level = 0.70)
flightdata_conf_inter
##           fit       lwr       upr
## 1   1211.1619 1095.2248 1327.0990
## 2   1172.4360 1074.0206 1270.8513
## 3   1067.4686 1009.2992 1125.6380
## 4   1061.4940 1005.2173 1117.7706
## 5   1059.4027 1003.7762 1115.0293
## 6   1057.7753 1002.6502 1112.9005
## 7    987.6433  949.7622 1025.5245
## 8    966.1149  931.5559 1000.6740
## 9    965.5131  931.0329  999.9934
## 10   953.4851  920.4292  986.5410
## 11   951.7346  918.8628  984.6063
## 12   946.9374  914.5423  979.3325
## 13   943.9299  911.8137  976.0461
## 14   942.1182  910.1631  974.0734
## 15   937.4581  905.8949  969.0212
## 16   931.9344  900.7992  963.0696
## 17   931.0594  899.9888  962.1299
## 18   929.1353  898.2044  960.0663
## 19   925.8858  895.1835  956.5880
## 20   925.3933  894.7251  956.0616
## 21   923.7406  893.1856  954.2957
## 22   914.4049  884.4782  944.3316
## 23   912.4103  882.6210  942.1995
## 24   911.6770  881.9391  941.4148
## 25   909.1075  879.5549  938.6602
## 26   903.6303  874.5141  932.7464
## 27   902.4947  873.4799  931.5096
## 28   899.5013  870.7805  928.2221
## 29   898.3260  869.7344  926.9176
## 30   897.2415  868.7779  925.7050
## 31   896.3336  867.9851  924.6822
## 32   893.1243  865.2610  920.9875
## 33   891.5240  863.9736  919.0745
## 34   890.1536  862.9380  917.3692
## 35   889.1668  862.2586  916.0751
## 36   888.4741  861.8427  915.1056
## 37   887.2217  861.7741  912.6693
## 38   887.8116  863.2955  912.3276
## 39   888.4849  864.3995  912.5703
## 40   888.6514  864.6536  912.6492
## 41   888.9223  865.0572  912.7875
## 42   889.0497  865.2434  912.8560
## 43   889.1849  865.4389  912.9309
## 44   890.5044  867.2651  913.7436
## 45   891.8909  869.0836  914.6983
## 46   892.4449  869.7915  915.0983
## 47   892.7732  870.2071  915.3393
## 48   893.7960  871.4856  916.1064
## 49   894.7401  872.6473  916.8328
## 50   895.5826  873.6718  917.4935
## 51   896.3135  874.5522  918.0749
## 52   896.3711  874.6212  918.1209
## 53   897.0678  875.4536  918.6819
## 54   897.9052  876.4468  919.3636
## 55   899.0032  877.7381  920.2682
## 56   900.7223  879.7381  921.7064
## 57   901.0127  880.0737  921.9517
## 58   901.7558  880.9294  922.5822
## 59   903.3369  882.7372  923.9365
## 60   903.5747  883.0078  924.1416
## 61   904.8058  884.4028  925.2088
## 62   907.9404  887.9178  927.9630
## 63   909.5084  889.6585  929.3584
## 64   911.6681  892.0388  931.2973
## 65   912.7503  893.2248  932.2757
## 66   913.2253  893.7440  932.7065
## 67   916.0535  896.8192  935.2879
## 68   918.0916  899.0192  937.1640
## 69   920.4475  901.5472  939.3478
## 70   924.2746  905.6226  942.9266
## 71   930.6245  912.3100  948.9391
## 72   930.6734  912.3611  948.9857
## 73   932.8864  914.6723  951.1005
## 74   935.5143  917.4054  953.6232
## 75   936.8921  918.8337  954.9506
## 76   942.3407  924.4529  960.2285
## 77   944.9576  927.1368  962.7783
## 78   946.6150  928.8321  964.3979
## 79   948.4625  930.7178  966.2071
## 80   949.1487  931.4172  966.8802
## 81   951.8652  934.1808  969.5496
## 82   959.9494  942.3613  977.5375
## 83   960.2070  942.6210  977.7930
## 84   964.6725  947.1146  982.2304
## 85   965.8911  948.3383  983.4439
## 86   968.7431  951.1983  986.2879
## 87   969.5261  951.9825  987.0696
## 88   973.1692  955.6267  990.7117
## 89   975.2680  957.7228  992.8133
## 90   982.0737  964.5049  999.6425
## 91   982.8311  965.2584 1000.4038
## 92   983.1030  965.5289 1000.6771
## 93   983.3958  965.8202 1000.9715
## 94   986.4588  968.8647 1004.0529
## 95   991.4619  973.8311 1009.0927
## 96   992.2452  974.6080 1009.8824
## 97   992.6673  975.0266 1010.3080
## 98   992.9007  975.2580 1010.5434
## 99   996.0859  978.4151 1013.7567
## 100  997.5677  979.8830 1015.2523
## 101  998.0999  980.4102 1015.7897
## 102  998.5923  980.8978 1016.2868
## 103  999.9081  982.2007 1017.6156
## 104 1009.5295  991.7197 1027.3393
## 105 1013.8029  995.9445 1031.6613
## 106 1014.5582  996.6912 1032.4253
## 107 1023.0566 1005.0903 1041.0228
## 108 1024.1331 1006.1542 1042.1120
## 109 1024.2983 1006.3175 1042.2792
## 110 1024.6347 1006.6499 1042.6194
## 111 1024.6844 1006.6991 1042.6697
## 112 1025.1276 1007.1371 1043.1181
## 113 1025.2338 1007.2421 1043.2256
## 114 1027.3025 1009.2866 1045.3184
## 115 1030.7718 1012.7157 1048.8279
## 116 1031.9767 1013.9068 1050.0466
## 117 1036.2311 1018.1131 1054.3491
## 118 1036.8782 1018.7530 1055.0035
## 119 1038.3879 1020.2460 1056.5298
## 120 1040.8621 1022.6933 1059.0309
## 121 1041.9993 1023.8183 1060.1803
## 122 1043.5277 1025.3305 1061.7249
## 123 1050.1701 1031.9055 1068.4347
## 124 1055.1206 1036.8093 1073.4318
## 125 1055.4238 1037.1098 1073.7378
## 126 1056.0051 1037.6858 1074.3243
## 127 1069.5108 1051.0843 1087.9372
## 128 1074.8795 1056.4189 1093.3401
## 129 1079.9056 1061.4177 1098.3935
## 130 1082.7127 1064.2116 1101.2138
## 131 1091.1128 1072.5813 1109.6443
## 132 1091.4588 1072.9263 1109.9913
## 133 1103.7872 1085.2364 1122.3379
## 134 1111.7888 1093.2429 1130.3347
## 135 1120.2297 1101.7031 1138.7562
## 136 1127.7918 1109.2944 1146.2893
## 137 1130.8198 1112.3369 1149.3028
## 138 1136.6220 1118.1713 1155.0727
## 139 1137.5323 1119.0872 1155.9774
## 140 1137.7836 1119.3401 1156.2271
## 141 1141.3509 1122.9306 1159.7712
## 142 1141.8914 1123.4748 1160.3080
## 143 1145.0768 1126.6828 1163.4707
## 144 1146.3269 1127.9423 1164.7116
## 145 1149.3677 1131.0067 1167.7288
## 146 1149.6546 1131.2958 1168.0133
## 147 1150.1469 1131.7921 1168.5017
## 148 1151.8565 1133.5157 1170.1972
## 149 1158.1970 1139.9116 1176.4825
## 150 1167.1266 1148.9275 1185.3257
## 151 1174.7724 1156.6543 1192.8905
## 152 1178.6239 1160.5488 1196.6990
## 153 1179.5270 1161.4622 1197.5918
## 154 1183.4649 1165.4457 1201.4840
## 155 1187.1565 1169.1813 1205.1316
## 156 1193.6067 1175.7111 1211.5024
## 157 1195.8015 1177.9335 1213.6695
## 158 1196.8326 1178.9777 1214.6874
## 159 1201.4171 1183.6214 1219.2127
## 160 1202.3021 1184.5180 1220.0861
## 161 1203.3121 1185.5412 1221.0829
## 162 1211.4908 1193.8294 1229.1522
## 163 1213.5565 1195.9232 1231.1898
## 164 1220.4855 1202.9479 1238.0231
## 165 1221.2791 1203.7526 1238.8057
## 166 1222.3089 1204.7968 1239.8211
## 167 1222.4828 1204.9731 1239.9926
## 168 1227.7754 1210.3402 1245.2107
## 169 1233.5254 1216.1719 1250.8789
## 170 1245.2626 1228.0776 1262.4475
## 171 1250.3259 1233.2139 1267.4378
## 172 1252.5546 1235.4749 1269.6344
## 173 1262.7268 1245.7935 1279.6601
## 174 1263.2794 1246.3541 1280.2048
## 175 1263.9813 1247.0661 1280.8966
## 176 1267.4552 1250.5896 1284.3207
## 177 1268.8302 1251.9843 1285.6761
## 178 1275.5823 1258.8323 1292.3324
## 179 1280.9700 1264.2957 1297.6443
## 180 1296.9557 1280.5007 1313.4106
## 181 1297.0129 1280.5587 1313.4671
## 182 1303.4020 1287.0327 1319.7713
## 183 1307.0152 1290.6931 1323.3374
## 184 1312.4072 1296.1543 1328.6601
## 185 1324.1460 1308.0385 1340.2536
## 186 1329.2190 1313.1717 1345.2663
## 187 1339.3750 1323.4436 1355.3064
## 188 1343.9519 1328.0704 1359.8333
## 189 1358.3201 1342.5858 1374.0545
## 190 1358.8500 1343.1208 1374.5792
## 191 1358.9272 1343.1988 1374.6557
## 192 1375.6998 1360.1223 1391.2772
## 193 1376.7390 1361.1701 1392.3079
## 194 1392.9358 1377.4894 1408.3821
## 195 1397.5125 1382.0967 1412.9283
## 196 1410.2689 1394.9287 1425.6091
## 197 1415.1756 1399.8606 1430.4906
## 198 1423.4326 1408.1553 1438.7099
## 199 1426.7138 1411.4498 1441.9778
## 200 1427.2047 1411.9426 1442.4668
## 201 1441.1065 1425.8895 1456.3236
## 202 1442.6841 1427.4711 1457.8971
## 203 1446.9871 1431.7840 1462.1903
## 204 1450.1854 1434.9886 1465.3823
## 205 1454.5901 1439.4004 1469.7798
## 206 1467.1886 1452.0103 1482.3670
## 207 1470.4595 1455.2819 1485.6372
## 208 1473.7050 1458.5272 1488.8828
## 209 1473.7749 1458.5971 1488.9527
## 210 1482.4560 1467.2734 1497.6386
## 211 1487.1618 1471.9740 1502.3496
## 212 1487.6783 1472.4898 1502.8668
## 213 1495.4332 1480.2320 1510.6344
## 214 1498.3004 1483.0933 1513.5076
## 215 1504.0993 1488.8783 1519.3202
## 216 1512.7282 1497.4819 1527.9744
## 217 1514.5746 1499.3223 1529.8270
## 218 1515.3459 1500.0910 1530.6009
## 219 1515.4832 1500.2278 1530.7387
## 220 1516.5968 1501.3375 1531.8561
## 221 1517.1910 1501.9295 1532.4524
## 222 1529.9323 1514.6198 1545.2447
## 223 1538.5240 1523.1710 1553.8769
## 224 1541.6650 1526.2960 1557.0340
## 225 1565.7534 1550.2426 1581.2642
## 226 1566.6224 1551.1059 1582.1389
## 227 1567.8744 1552.3495 1583.3992
## 228 1597.2266 1581.4862 1612.9669
## 229 1600.5025 1584.7358 1616.2692
## 230 1610.2859 1594.4384 1626.1335
## 231 1612.6194 1596.7521 1628.4867
## 232 1616.1131 1600.2159 1632.0103
## 233 1644.7069 1628.5526 1660.8612
## 234 1645.8564 1629.6914 1662.0214
## 235 1658.9868 1642.6978 1675.2757
## 236 1659.5638 1643.2693 1675.8583
## 237 1659.7860 1643.4894 1676.0826
## 238 1663.3240 1646.9936 1679.6545
## 239 1664.2895 1647.9498 1680.6292
## 240 1685.5347 1668.9891 1702.0803
## 241 1689.5712 1672.9862 1706.1562
## 242 1689.9821 1673.3931 1706.5712
## 243 1697.9084 1681.2420 1714.5749
## 244 1707.8040 1691.0410 1724.5670
## 245 1709.6098 1692.8293 1726.3904
## 246 1722.0675 1705.1661 1738.9689
## 247 1724.4944 1707.5696 1741.4191
## 248 1727.1543 1710.2040 1744.1047
## 249 1734.4021 1717.3824 1751.4217
## 250 1745.2907 1728.1682 1762.4133
## 251 1752.1779 1734.9911 1769.3646
## 252 1753.3855 1736.1876 1770.5834
## 253 1757.2831 1740.0494 1774.5169
## 254 1759.1058 1741.8553 1776.3562
## 255 1761.4145 1744.1430 1778.6859
## 256 1770.8932 1753.5364 1788.2499
## 257 1775.2380 1757.8429 1792.6332
## 258 1777.1541 1759.7421 1794.5661
## 259 1796.5967 1779.0192 1814.1743
## 260 1799.7933 1782.1895 1817.3971
## 261 1801.8982 1784.2773 1819.5192
## 262 1805.4925 1787.8426 1823.1424
## 263 1816.5415 1798.8050 1834.2781
## 264 1822.1642 1804.3849 1839.9434
## 265 1823.7654 1805.9742 1841.5566
## 266 1824.5042 1806.7075 1842.3009
## 267 1872.7377 1854.6239 1890.8515
## 268 1879.2950 1861.1448 1897.4452
## 269 1890.9392 1872.7286 1909.1499
## 270 1912.3610 1894.0525 1930.6695
## 271 1916.0770 1897.7533 1934.4007
## 272 1922.6819 1904.3322 1941.0315
## 273 1923.6421 1905.2889 1941.9954
## 274 1937.4639 1919.0619 1955.8659
## 275 1939.0853 1920.6780 1957.4925
## 276 1969.4533 1950.9628 1987.9438
## 277 1969.4707 1950.9802 1987.9612
## 278 1984.5592 1966.0380 2003.0805
## 279 1992.8913 1974.3559 2011.4267
## 280 1996.1953 1977.6549 2014.7358
## 281 2007.4052 1988.8497 2025.9607
## 282 2010.0432 1991.4847 2028.6018
## 283 2014.4842 1995.9209 2033.0476
## 284 2015.7585 1997.1939 2034.3231
## 285 2024.0456 2005.4737 2042.6175
## 286 2042.6167 2024.0343 2061.1992
## 287 2049.5739 2030.9896 2068.1583
## 288 2052.8317 2034.2467 2071.4167
## 289 2065.6891 2047.1038 2084.2743
## 290 2065.8089 2047.2236 2084.3941
## 291 2076.0466 2057.4635 2094.6298
## 292 2082.3731 2063.7921 2100.9541
## 293 2082.5666 2063.9856 2101.1475
## 294 2091.0744 2072.4974 2109.6514
## 295 2094.4717 2075.8966 2113.0467
## 296 2098.8221 2080.2497 2117.3945
## 297 2117.1253 2098.5671 2135.6834
## 298 2119.0599 2100.5035 2137.6163
## 299 2135.0881 2116.5478 2153.6284
## 300 2140.4090 2121.8747 2158.9433
## 301 2149.7599 2131.2367 2168.2832
## 302 2157.2133 2138.6994 2175.7272
## 303 2164.2363 2145.7316 2182.7410
## 304 2175.1980 2156.7082 2193.6878
## 305 2179.8993 2161.4160 2198.3825
## 306 2186.5107 2168.0370 2204.9845
## 307 2187.7111 2169.2391 2206.1832
## 308 2194.9307 2176.4692 2213.3922
## 309 2204.5618 2186.1145 2223.0090
## 310 2208.5977 2190.1565 2227.0390
## 311 2216.6289 2198.1997 2235.0581
## 312 2226.0737 2207.6587 2244.4887
## 313 2231.3707 2212.9636 2249.7777
## 314 2234.1767 2215.7738 2252.5795
## 315 2234.5434 2216.1411 2252.9457
## 316 2249.1168 2230.7360 2267.4976
## 317 2255.6727 2237.3013 2274.0440
## 318 2271.5607 2253.2115 2289.9099
## 319 2281.0034 2262.6667 2299.3400
## 320 2299.1356 2280.8213 2317.4499
## 321 2309.2163 2290.9133 2327.5194
## 322 2315.2371 2296.9404 2333.5338
## 323 2337.8975 2319.6212 2356.1738
## 324 2341.4471 2323.1734 2359.7207
## 325 2347.1534 2328.8837 2365.4230
## 326 2382.8031 2364.5485 2401.0577
## 327 2435.3409 2417.0723 2453.6094
## 328 2457.0773 2438.7885 2475.3661
## 329 2502.4651 2484.1035 2520.8266
## 330 2504.2026 2485.8374 2522.5677
## 331 2505.1221 2486.7550 2523.4892
## 332 2559.6036 2541.0875 2578.1196
## 333 2569.7509 2551.1997 2588.3020
## 334 2584.2249 2565.6196 2602.8302
## 335 2590.8242 2572.1927 2609.4558
## 336 2617.1260 2598.3798 2635.8722
## 337 2639.4510 2620.5951 2658.3070
## 338 2646.5297 2627.6366 2665.4228
## 339 2668.8137 2649.7961 2687.8312
## 340 2701.2908 2682.0723 2720.5094
## 341 2729.4017 2709.9907 2748.8126
## 342 2735.3625 2715.9086 2754.8164
## 343 2736.8153 2717.3508 2756.2797
## 344 2744.8869 2725.3629 2764.4109
## 345 2773.7232 2753.9759 2793.4705
## 346 2809.3931 2789.3476 2829.4387
## 347 2832.8582 2812.6040 2853.1125
## 348 2920.0608 2898.9549 2941.1667
## 349 2939.8471 2918.5334 2961.1608
## 350 2967.8333 2946.2178 2989.4488
## 351 2978.8479 2957.1114 3000.5845
## 352 2998.3729 2976.4188 3020.3270
## 353 3001.1933 2979.2076 3023.1791
## 354 3046.3445 3023.8420 3068.8469
## 355 3048.9207 3026.3883 3071.4531
## 356 3080.1273 3057.2290 3103.0257
## 357 3123.7216 3100.3035 3147.1398
## 358 3129.2648 3105.7799 3152.7496
## 359 3144.1091 3120.4453 3167.7729
## 360 3154.0328 3130.2490 3177.8167
## 361 3159.7841 3135.9306 3183.6376
## 362 3173.3051 3149.2875 3197.3226
## 363 3198.3112 3173.9895 3222.6330
## 364 3222.4982 3197.8818 3247.1147
## 365 3228.1590 3203.4736 3252.8445
## 366 3240.0340 3215.2039 3264.8641
## 367 3252.8637 3227.8773 3277.8500
## 368 3310.2881 3284.6052 3335.9711
## 369 3354.9152 3328.6964 3381.1339
## 370 3405.3885 3378.5725 3432.2044
## 371 3476.3661 3448.7311 3504.0012
## 372 3488.0445 3460.2774 3515.8117
## 373 3494.3783 3466.5398 3522.2167
## 374 3552.9379 3524.4524 3581.4235
## 375 3571.4390 3542.7537 3600.1243
## 376 3651.7959 3622.2710 3681.3207
## 377 3727.4207 3697.1485 3757.6928
## 378 3734.0565 3703.7208 3764.3922
## 379 3736.9189 3706.5559 3767.2820
## 380 3763.2712 3732.6593 3793.8830
## 381 3780.0442 3749.2766 3810.8118
## 382 3971.7498 3939.3327 4004.1669
## 383 3982.9112 3950.4046 4015.4179
## 384 4069.1328 4035.9507 4102.3148
## 385 4173.0696 4139.0981 4207.0412
## 386 4190.2802 4156.1786 4224.3819
## 387 4234.0701 4199.6361 4268.5041
## 388 4260.3673 4225.7319 4295.0027
## 389 4262.6372 4227.9843 4297.2901
## 390 4473.9336 4437.5185 4510.3488
## 391 4565.8056 4528.4798 4603.1313
## 392 4687.4681 4648.7158 4726.2205
## 393 4750.3538 4710.7392 4789.9684
## 394 4969.5808 4926.0990 5013.0626
## 395 4987.6054 4943.7369 5031.4740
## 396 5248.9184 5198.0616 5299.7751
## 397 5253.5425 5202.5373 5304.5476
## 398 5642.3197 5575.5448 5709.0947
## 399 6094.8633 6001.5138 6188.2128
## 400 6653.6284 6515.9397 6791.3172
#b) Scatter plot of Speed against Distance with the predicted values and 70% confidence intervals
plot(X, Y, xlab = "Speed", main = "Regression Spline with df = 5", ylab = "Distance", cex = .5)
# the data are already ordered by speed, so lines() traces the curves correctly
lines(X, flightdata_sp$fitted.values, col = "blue", lwd = 1)
lines(X, flightdata_conf_inter[, 2], col = "red", lty = 2)
lines(X, flightdata_conf_inter[, 3], col = "red", lty = 2)
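# For reference, the same 70% band can be rebuilt by hand from the standard
# errors: fit +/- t-quantile * se, using qt(0.85, df) for a two-sided 70%
# interval. A minimal sketch (the variable names here are illustrative):
pred_se <- predict(flightdata_sp, se.fit = TRUE)
t_mult <- qt(0.85, df = flightdata_sp$df.residual)
lwr_manual <- pred_se$fit - t_mult * pred_se$se.fit
upr_manual <- pred_se$fit + t_mult * pred_se$se.fit
# lwr_manual and upr_manual should match flightdata_conf_inter[, "lwr"] and [, "upr"]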

Question 2

Using the same dataset as in the previous question, now create a natural cubic spline model with 3 degrees of freedom. As before, use “speed” as the covariate and “distance” as the target variable. Use ANOVA to compare this model with the one you developed in the previous question. Which of the models has the smallest sum of squared errors? Based on your results, do you believe there is a statistically significant difference between the predictive performance of the two models? If so, which model do you think would be better to use for predicting landing distance? Explain your reasoning.

#Natural cubic spline with 3 degrees of freedom
ns_flightdata <- lm(Y ~ ns(X, df = 3), data = flightdata)
summary(ns_flightdata)
## 
## Call:
## lm(formula = Y ~ ns(X, df = 3), data = flightdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -564.04 -121.52   -3.54  130.88  541.75 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1069.81      59.34   18.03   <2e-16 ***
## ns(X, df = 3)1  1266.40      44.35   28.55   <2e-16 ***
## ns(X, df = 3)2  3236.23     137.77   23.49   <2e-16 ***
## ns(X, df = 3)3  5494.25      75.91   72.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 195.6 on 396 degrees of freedom
## Multiple R-squared:  0.9629, Adjusted R-squared:  0.9626 
## F-statistic:  3429 on 3 and 396 DF,  p-value: < 2.2e-16
# Compare this model to the model in question 1 using ANOVA
anova(flightdata_sp, ns_flightdata)
## Analysis of Variance Table
## 
## Model 1: Y ~ bs(X, df = 5)
## Model 2: Y ~ ns(X, df = 3)
##   Res.Df      RSS Df Sum of Sq      F  Pr(>F)  
## 1    394 14851546                              
## 2    396 15155277 -2   -303730 4.0289 0.01853 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The bs model (Model 1) has the smaller residual sum of squares: 14,851,546 vs. 15,155,277 for the ns model. The ANOVA p-value (0.01853) is below 0.05, so the difference in fit between the two models is statistically significant. Given that, the more flexible bs model from Question 1 is the better choice for predicting landing distance.
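# The RSS values reported in the ANOVA table can be verified directly from
# each model's residuals:
sum(residuals(flightdata_sp)^2)  # bs model: 14851546 per the ANOVA table
sum(residuals(ns_flightdata)^2)  # ns model: 15155277 per the ANOVA table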

Question 3

Load the health_dataset csv file into R. This data contains various pieces of health information about some unidentified individuals. Randomly separate the rows of this data into training and testing sets. Use 50% of the data as your training set, and use the rest of the data as your testing set.

# set seed to always randomly sample same numbers
set.seed(1234)
# read health data set in
health <- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Test 1/health_dataset.csv")
# what does the data look like?
str(health)
## 'data.frame':    2000 obs. of  20 variables:
##  $ Gender      : chr  "female" "female" "female" "female" ...
##  $ Age         : int  49 45 45 45 66 54 50 16 56 56 ...
##  $ Weight      : num  86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
##  $ Height      : num  168 167 167 167 170 ...
##  $ BMI         : num  30.6 27.2 27.2 27.2 23.7 ...
##  $ Pulse       : int  86 62 62 62 60 76 74 76 64 64 ...
##  $ BPSysAve    : int  112 118 118 118 111 134 142 126 95 95 ...
##  $ BPDiaAve    : int  75 64 64 64 63 85 68 72 69 69 ...
##  $ BPSys1      : int  118 106 106 106 124 136 138 132 94 94 ...
##  $ BPDia1      : int  82 62 62 62 64 86 66 74 74 74 ...
##  $ BPSys2      : int  108 118 118 118 108 132 142 126 94 94 ...
##  $ BPDia2      : int  74 68 68 68 62 88 74 68 70 70 ...
##  $ BPSys3      : int  116 118 118 118 114 136 142 126 96 96 ...
##  $ BPDia3      : int  76 60 60 60 64 82 62 76 68 68 ...
##  $ DirectChol  : num  1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
##  $ TotChol     : num  6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
##  $ UrineVol1   : int  77 106 106 106 113 215 64 345 26 26 ...
##  $ UrineFlow1  : num  0.094 1.116 1.116 1.116 0.489 ...
##  $ Diabetes    : chr  "No" "No" "No" "No" ...
##  $ SleepTrouble: chr  "Yes" "No" "No" "No" ...
# 20 variables: a mix of numeric, integer, and character columns

# randomly split the rows into 50% training and 50% testing sets
sample_health <- sample(nrow(health), nrow(health)*0.50)
health_train <- health[sample_health,]
health_test <- health[-sample_health,]
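# Sanity check on the split: Diabetes is heavily imbalanced toward "No",
# so it is worth confirming both sets contain a similar share of "Yes" cases:
prop.table(table(health_train$Diabetes))
prop.table(table(health_test$Diabetes))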

a) Using “Diabetes” as your target variable and all other columns as covariates, develop a decision tree, a random forest with 100 trees, and a boosting model. In developing the boosting model, you can either use R’s default settings for things like the number of trees and shrinkage, or you can use values of your own choosing. For each of your models, display a confusion matrix showing the number of people who are misclassified in the testing set. Which model misclassifies the smallest number of people in the testing set?

#decision tree (classification tree) for character variable Diabetes

#load libraries for decision trees
library(rpart)
library(rpart.plot)

#decision tree model
health_rpart1 <- rpart(formula = Diabetes ~ ., data = health_train, method = "class")
pred1 <- predict(health_rpart1, health_test, type = "class")
pred1
##    1    2    3    5    7    8    9   12   14   15   16   17   18   20   21   22 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##   24   26   28   29   31   33   36   37   38   44   46   47   50   52   56   62 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##   64   68   69   70   72   73   74   75   76   77   78   80   81   83   84   85 
##   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No 
##   87   89   90   92   93   94   99  101  104  105  106  108  110  111  112  113 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  114  115  117  119  120  121  122  129  130  133  134  139  140  143  144  146 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes 
##  147  148  149  150  151  152  154  156  157  158  159  167  172  173  175  179 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  180  185  186  187  188  190  192  193  195  196  197  198  200  209  210  211 
##  Yes   No   No   No   No   No  Yes   No   No   No   No  Yes   No   No   No   No 
##  212  213  217  218  219  220  221  227  229  230  231  232  233  234  236  238 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  239  240  242  243  244  246  247  249  252  253  255  256  257  260  261  264 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  265  266  267  268  271  272  273  274  275  277  282  285  287  288  289  290 
##   No   No   No   No   No   No  Yes   No   No  Yes   No   No   No   No   No   No 
##  291  295  297  299  300  301  302  304  306  310  313  314  315  316  317  320 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  322  323  324  325  327  329  330  332  336  337  339  342  343  345  351  353 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  355  356  358  359  360  362  363  364  370  372  374  375  376  377  378  381 
##   No   No   No   No   No   No   No   No   No   No   No   No  Yes  Yes   No   No 
##  384  385  387  391  392  393  394  397  398  401  403  404  407  408  409  410 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  411  412  414  415  416  418  419  423  424  425  426  428  430  433  435  437 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  438  439  441  443  444  445  446  447  450  451  452  455  458  466  468  470 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  472  481  483  490  492  494  496  497  498  500  502  503  505  506  507  509 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No 
##  512  513  516  517  521  527  528  531  536  538  539  544  550  551  553  555 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  556  557  558  559  560  564  565  566  567  568  571  575  579  581  582  584 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  586  591  592  593  594  596  599  610  611  612  616  619  620  621  624  628 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  631  632  636  640  641  643  644  649  650  651  654  656  657  658  660  661 
##   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No   No 
##  662  663  666  671  672  673  675  676  678  682  684  685  686  690  691  692 
##   No   No   No  Yes   No   No   No   No   No   No   No   No   No   No   No   No 
##  695  701  705  706  707  710  712  713  714  717  718  719  720  721  722  723 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  728  732  734  735  736  738  739  743  744  745  747  748  749  750  751  753 
##   No   No   No   No   No  Yes  Yes   No   No   No   No   No   No   No   No   No 
##  756  758  759  761  762  763  764  766  767  769  774  777  781  783  785  787 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  794  795  796  797  798  799  800  805  809  812  814  817  820  821  825  826 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  827  829  832  837  839  842  844  845  846  847  850  852  855  856  857  858 
##   No   No   No   No   No   No   No   No   No   No  Yes  Yes   No   No   No   No 
##  859  862  863  867  871  872  873  876  878  879  880  881  882  887  888  890 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  892  899  904  907  910  914  915  923  925  927  932  933  935  941  942  943 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  944  945  948  949  954  955  958  960  961  964  965  966  968  969  974  975 
##   No   No   No   No  Yes   No   No   No   No   No   No   No   No   No   No   No 
##  977  978  981  983  985  987  990  992  994  997  999 1001 1002 1005 1006 1007 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1013 1015 1016 1019 1020 1021 1022 1023 1026 1027 1029 1031 1032 1033 1034 1035 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No 
## 1037 1038 1040 1042 1044 1047 1048 1049 1050 1052 1053 1057 1059 1063 1064 1066 
##   No   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No 
## 1067 1068 1069 1070 1071 1076 1078 1080 1082 1083 1085 1086 1089 1090 1091 1093 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1096 1097 1102 1105 1112 1113 1114 1115 1116 1117 1120 1122 1123 1126 1130 1133 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1134 1135 1137 1139 1141 1145 1149 1152 1153 1156 1157 1158 1161 1162 1163 1164 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1168 1170 1172 1177 1178 1180 1181 1182 1186 1187 1189 1191 1193 1196 1197 1199 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1200 1201 1204 1205 1207 1210 1214 1215 1217 1220 1221 1223 1224 1227 1229 1230 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1232 1233 1237 1240 1243 1244 1246 1249 1250 1252 1253 1257 1263 1265 1266 1268 
##  Yes   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No 
## 1269 1270 1272 1278 1279 1280 1282 1283 1285 1286 1288 1290 1291 1292 1293 1294 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1300 1303 1304 1306 1307 1310 1312 1313 1314 1316 1317 1319 1320 1322 1323 1324 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1325 1326 1327 1334 1338 1339 1340 1341 1342 1344 1346 1347 1351 1353 1354 1355 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1356 1357 1360 1363 1370 1372 1375 1376 1377 1378 1379 1380 1383 1384 1388 1391 
##   No   No   No   No   No   No   No  Yes  Yes   No   No   No   No   No   No   No 
## 1393 1397 1398 1401 1404 1405 1408 1410 1411 1412 1413 1414 1415 1416 1417 1418 
##   No   No  Yes   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1420 1425 1426 1430 1431 1432 1434 1435 1437 1439 1440 1441 1442 1444 1447 1449 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes 
## 1450 1451 1453 1455 1457 1459 1461 1462 1464 1466 1467 1469 1471 1475 1477 1481 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1484 1486 1488 1489 1496 1497 1500 1504 1507 1509 1511 1513 1514 1516 1517 1518 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1520 1522 1523 1527 1528 1529 1530 1533 1537 1538 1540 1541 1543 1544 1547 1549 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1550 1551 1554 1555 1556 1559 1564 1566 1567 1568 1569 1570 1571 1572 1573 1574 
##   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No   No   No 
## 1576 1582 1583 1587 1589 1591 1593 1596 1598 1599 1600 1603 1606 1607 1608 1609 
##   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No   No 
## 1610 1611 1612 1615 1625 1631 1637 1638 1639 1640 1641 1642 1644 1645 1646 1647 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1648 1649 1652 1654 1656 1657 1658 1661 1663 1665 1668 1669 1670 1674 1675 1676 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1677 1678 1681 1683 1686 1687 1688 1689 1690 1692 1693 1696 1697 1698 1700 1707 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No 
## 1708 1710 1712 1713 1714 1717 1718 1719 1720 1722 1724 1726 1727 1728 1729 1730 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1733 1734 1735 1737 1738 1742 1743 1747 1748 1750 1755 1757 1758 1759 1763 1764 
##   No  Yes   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1765 1767 1769 1770 1771 1773 1775 1776 1777 1778 1780 1781 1783 1784 1786 1787 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1790 1791 1793 1794 1795 1797 1799 1801 1805 1810 1812 1813 1815 1816 1817 1819 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1825 1827 1831 1833 1836 1837 1838 1839 1844 1845 1846 1847 1852 1856 1857 1859 
##   No   No   No   No   No   No  Yes  Yes   No   No   No   No   No   No   No   No 
## 1860 1861 1865 1866 1868 1869 1871 1873 1874 1876 1878 1879 1881 1882 1885 1886 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1887 1891 1892 1893 1896 1899 1903 1906 1907 1909 1911 1914 1916 1917 1919 1921 
##   No   No   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No 
## 1923 1927 1929 1931 1932 1933 1938 1941 1944 1948 1949 1950 1951 1952 1954 1958 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1961 1963 1967 1969 1970 1971 1973 1974 1975 1977 1978 1979 1981 1983 1985 1988 
##   No  Yes   No   No  Yes   No   No   No   No   No   No   No   No   No   No   No 
## 1989 1991 1993 1994 1995 1996 1998 1999 
##   No   No   No   No   No   No   No   No 
## Levels: No Yes
#confusion matrix
table(health_test$Diabetes, pred1, dnn = c("True", "Pred"))
##      Pred
## True   No Yes
##   No  894  17
##   Yes  72  17
sum(health_test$Diabetes != pred1) #number misclassified in the testing set
## [1] 89

#total misclassified 89
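# rpart.plot was loaded above but never used; the fitted tree can be
# visualized with it (optional, purely illustrative):
rpart.plot(health_rpart1)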
set.seed(1234)
# random forest
#install.packages('randomForest')
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
#random forest model
health_train$Diabetes <- as.factor(health_train$Diabetes)
health_test$Diabetes <- as.factor(health_test$Diabetes)
rf_health <- randomForest(Diabetes ~ ., data = health_train, importance = TRUE, ntree = 100)
rf_pred1 <- predict(rf_health, health_test)
rf_pred1
##    1    2    3    5    7    8    9   12   14   15   16   17   18   20   21   22 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##   24   26   28   29   31   33   36   37   38   44   46   47   50   52   56   62 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##   64   68   69   70   72   73   74   75   76   77   78   80   81   83   84   85 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##   87   89   90   92   93   94   99  101  104  105  106  108  110  111  112  113 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  114  115  117  119  120  121  122  129  130  133  134  139  140  143  144  146 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes 
##  147  148  149  150  151  152  154  156  157  158  159  167  172  173  175  179 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  180  185  186  187  188  190  192  193  195  196  197  198  200  209  210  211 
##  Yes   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  212  213  217  218  219  220  221  227  229  230  231  232  233  234  236  238 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  239  240  242  243  244  246  247  249  252  253  255  256  257  260  261  264 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  265  266  267  268  271  272  273  274  275  277  282  285  287  288  289  290 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  291  295  297  299  300  301  302  304  306  310  313  314  315  316  317  320 
##   No   No   No   No   No   No  Yes  Yes   No   No   No   No   No   No   No   No 
##  322  323  324  325  327  329  330  332  336  337  339  342  343  345  351  353 
##   No   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No 
##  355  356  358  359  360  362  363  364  370  372  374  375  376  377  378  381 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  384  385  387  391  392  393  394  397  398  401  403  404  407  408  409  410 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  411  412  414  415  416  418  419  423  424  425  426  428  430  433  435  437 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  438  439  441  443  444  445  446  447  450  451  452  455  458  466  468  470 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  472  481  483  490  492  494  496  497  498  500  502  503  505  506  507  509 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  512  513  516  517  521  527  528  531  536  538  539  544  550  551  553  555 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  556  557  558  559  560  564  565  566  567  568  571  575  579  581  582  584 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  586  591  592  593  594  596  599  610  611  612  616  619  620  621  624  628 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  631  632  636  640  641  643  644  649  650  651  654  656  657  658  660  661 
##   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No   No 
##  662  663  666  671  672  673  675  676  678  682  684  685  686  690  691  692 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  695  701  705  706  707  710  712  713  714  717  718  719  720  721  722  723 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  728  732  734  735  736  738  739  743  744  745  747  748  749  750  751  753 
##   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No   No   No 
##  756  758  759  761  762  763  764  766  767  769  774  777  781  783  785  787 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  794  795  796  797  798  799  800  805  809  812  814  817  820  821  825  826 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  827  829  832  837  839  842  844  845  846  847  850  852  855  856  857  858 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  859  862  863  867  871  872  873  876  878  879  880  881  882  887  888  890 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No 
##  892  899  904  907  910  914  915  923  925  927  932  933  935  941  942  943 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
##  944  945  948  949  954  955  958  960  961  964  965  966  968  969  974  975 
##   No   No   No   No  Yes   No   No   No   No   No   No   No   No   No   No   No 
##  977  978  981  983  985  987  990  992  994  997  999 1001 1002 1005 1006 1007 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1013 1015 1016 1019 1020 1021 1022 1023 1026 1027 1029 1031 1032 1033 1034 1035 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1037 1038 1040 1042 1044 1047 1048 1049 1050 1052 1053 1057 1059 1063 1064 1066 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1067 1068 1069 1070 1071 1076 1078 1080 1082 1083 1085 1086 1089 1090 1091 1093 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1096 1097 1102 1105 1112 1113 1114 1115 1116 1117 1120 1122 1123 1126 1130 1133 
##   No   No   No   No   No   No   No   No   No   No   No  Yes  Yes   No   No   No 
## 1134 1135 1137 1139 1141 1145 1149 1152 1153 1156 1157 1158 1161 1162 1163 1164 
##   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No   No   No 
## 1168 1170 1172 1177 1178 1180 1181 1182 1186 1187 1189 1191 1193 1196 1197 1199 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1200 1201 1204 1205 1207 1210 1214 1215 1217 1220 1221 1223 1224 1227 1229 1230 
##   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes   No   No 
## 1232 1233 1237 1240 1243 1244 1246 1249 1250 1252 1253 1257 1263 1265 1266 1268 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1269 1270 1272 1278 1279 1280 1282 1283 1285 1286 1288 1290 1291 1292 1293 1294 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1300 1303 1304 1306 1307 1310 1312 1313 1314 1316 1317 1319 1320 1322 1323 1324 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1325 1326 1327 1334 1338 1339 1340 1341 1342 1344 1346 1347 1351 1353 1354 1355 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1356 1357 1360 1363 1370 1372 1375 1376 1377 1378 1379 1380 1383 1384 1388 1391 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1393 1397 1398 1401 1404 1405 1408 1410 1411 1412 1413 1414 1415 1416 1417 1418 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1420 1425 1426 1430 1431 1432 1434 1435 1437 1439 1440 1441 1442 1444 1447 1449 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1450 1451 1453 1455 1457 1459 1461 1462 1464 1466 1467 1469 1471 1475 1477 1481 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes 
## 1484 1486 1488 1489 1496 1497 1500 1504 1507 1509 1511 1513 1514 1516 1517 1518 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1520 1522 1523 1527 1528 1529 1530 1533 1537 1538 1540 1541 1543 1544 1547 1549 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1550 1551 1554 1555 1556 1559 1564 1566 1567 1568 1569 1570 1571 1572 1573 1574 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1576 1582 1583 1587 1589 1591 1593 1596 1598 1599 1600 1603 1606 1607 1608 1609 
##   No   No   No   No   No   No   No  Yes   No   No   No   No   No   No   No   No 
## 1610 1611 1612 1615 1625 1631 1637 1638 1639 1640 1641 1642 1644 1645 1646 1647 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1648 1649 1652 1654 1656 1657 1658 1661 1663 1665 1668 1669 1670 1674 1675 1676 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1677 1678 1681 1683 1686 1687 1688 1689 1690 1692 1693 1696 1697 1698 1700 1707 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1708 1710 1712 1713 1714 1717 1718 1719 1720 1722 1724 1726 1727 1728 1729 1730 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1733 1734 1735 1737 1738 1742 1743 1747 1748 1750 1755 1757 1758 1759 1763 1764 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1765 1767 1769 1770 1771 1773 1775 1776 1777 1778 1780 1781 1783 1784 1786 1787 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1790 1791 1793 1794 1795 1797 1799 1801 1805 1810 1812 1813 1815 1816 1817 1819 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1825 1827 1831 1833 1836 1837 1838 1839 1844 1845 1846 1847 1852 1856 1857 1859 
##   No   No   No   No   No   No  Yes  Yes   No   No   No   No   No   No   No   No 
## 1860 1861 1865 1866 1868 1869 1871 1873 1874 1876 1878 1879 1881 1882 1885 1886 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1887 1891 1892 1893 1896 1899 1903 1906 1907 1909 1911 1914 1916 1917 1919 1921 
##   No   No   No   No   No   No   No   No   No   No   No   No   No  Yes  Yes   No 
## 1923 1927 1929 1931 1932 1933 1938 1941 1944 1948 1949 1950 1951 1952 1954 1958 
##   No   No   No  Yes  Yes  Yes   No   No   No   No   No   No   No   No   No   No 
## 1961 1963 1967 1969 1970 1971 1973 1974 1975 1977 1978 1979 1981 1983 1985 1988 
##   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No   No 
## 1989 1991 1993 1994 1995 1996 1998 1999 
##   No   No   No   No   No   No   No   No 
## Levels: No Yes
# confusion matrix
table(health_test$Diabetes, rf_pred1, dnn = c("True", "Pred"))
##      Pred
## True   No Yes
##   No  911   0
##   Yes  67  22
sum(health_test$Diabetes != rf_pred1)
## [1] 67
# total misclassified 67
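# Because the forest was fit with importance = TRUE, variable importance
# can also be examined (optional, not required by the question):
importance(rf_health)
varImpPlot(rf_health)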
set.seed(1234)
# boosting model
#install.packages("ggplot2")
#install.packages("caret")
#install.packages("adabag")
library(adabag)
## Loading required package: caret
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
#boosting model
health_boost <- boosting(Diabetes ~ ., data = health_train, boos = TRUE)
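# The call above relies on adabag's defaults (mfinal = 100 iterations,
# coeflearn = "Breiman"). If tuning were desired, those settings could be
# chosen explicitly; a hedged, untested alternative:
# health_boost <- boosting(Diabetes ~ ., data = health_train, boos = TRUE,
#                          mfinal = 150, coeflearn = "Freund")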

#variable of importance
health_boost$importance
##          Age          BMI       BPDia1       BPDia2       BPDia3     BPDiaAve 
##    9.7224818    7.3471566    4.2315529    2.2132480    2.6707419    4.0858637 
##       BPSys1       BPSys2       BPSys3     BPSysAve   DirectChol       Gender 
##    2.7075574    3.8308598    4.1279362    4.8625001    6.7513092    0.8828666 
##       Height        Pulse SleepTrouble      TotChol   UrineFlow1    UrineVol1 
##    6.5782872    6.5247443    1.0706133    9.7997150    6.5167752    6.8937461 
##       Weight 
##    9.1820448
boost_pred1 = predict(health_boost, newdata = health_test)
boost_pred1$class
##    [1] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [13] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [25] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [37] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [49] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [61] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [73] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No" 
##   [85] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   [97] "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [109] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [121] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [133] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [145] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [157] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "Yes"
##  [169] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [181] "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [193] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [205] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [217] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [229] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [241] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [253] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [265] "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No" 
##  [277] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [289] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [301] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [313] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [325] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No" 
##  [337] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [349] "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [361] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [373] "No"  "Yes" "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [385] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [397] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [409] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [421] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [433] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [445] "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [457] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [469] "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No" 
##  [481] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [493] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [505] "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No" 
##  [517] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [529] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [541] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [553] "No"  "No"  "No"  "Yes" "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [565] "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No" 
##  [577] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [589] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [601] "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No" 
##  [613] "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [625] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [637] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [649] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [661] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No" 
##  [673] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "Yes" "No"  "No"  "No" 
##  [685] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [697] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [709] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [721] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [733] "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [745] "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No" 
##  [757] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [769] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [781] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes"
##  [793] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [805] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [817] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [829] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [841] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [853] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [865] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [877] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [889] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [901] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [913] "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "Yes" "No"  "No"  "No"  "No" 
##  [925] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [937] "No"  "No"  "Yes" "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes"
##  [949] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "Yes" "No" 
##  [961] "No"  "No"  "No"  "Yes" "Yes" "Yes" "No"  "No"  "No"  "No"  "No"  "No" 
##  [973] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "No" 
##  [985] "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##  [997] "No"  "No"  "No"  "No"
boost_pred1$confusion
##                Observed Class
## Predicted Class  No Yes
##             No  903  61
##             Yes   8  28
boost_pred1$error
## [1] 0.069
boost_pred1$error*nrow(health_test)
## [1] 69
# total misclassified 69

#The random forest model misclassifies the fewest people in the testing set: 67, compared with 89 for the decision tree and 69 for the boosting model.

b) Using “Diabetes” as your target variable and only the numeric columns of data as covariates, develop a KNN model. Display a confusion matrix for your testing set. Based on its misclassifications, do you think the KNN model is better or worse than the models developed in part a?

set.seed(1234)
# KNN model
library(class)
# columns 2:18 are the numeric covariates; column 19 is Diabetes
knn_health <- knn(train = health_train[, 2:18], test = health_test[, 2:18], cl = health_train[, 19], k = 5)

#Show the first few predictions made on the testing set; the full vector holds
#1,000 predictions (almost all "No"), which the confusion matrix below summarizes
head(knn_health, 36)
##  [1] No  No  No  No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No 
## [19] No  No  No  No  No  No  No  No  No  No  No  No  No  No  No  No  No  No 
## Levels: No Yes
# Confusion Matrix
table(health_test[,19], knn_health, dnn = c("True", "Predicted"))
##      Predicted
## True   No Yes
##   No  902   9
##   Yes  77  12
sum(health_test[,19] != knn_health) #number misclassified
## [1] 86
#total misclassified is 86, so the KNN model is worse than the random forest (67) and boosting (69) models but better than the decision tree (89)
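
#As an optional extra (not part of the original question), one could rerun
#knn() over several values of k to see whether k = 5 is a reasonable choice;
#a minimal sketch reusing the training and testing objects above:
for (k in c(1, 3, 5, 7, 9, 15)) {
  pred_k <- knn(train = health_train[,2:18], test = health_test[,2:18],
                cl = health_train[,19], k = k)
  cat("k =", k, "-> misclassified:", sum(health_test[,19] != pred_k), "\n")
}
#Note: knn() uses raw Euclidean distance, so standardizing the covariates
#(e.g. with scale()) before fitting is often worthwhile.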

Question 4

Develop a generalized additive model (GAM) to predict whether or not someone will default on a payment using the default_payments csv file. Use the “DEFAULT” column as your target, and all other variables as your covariates. Use smoothing splines for all numeric covariates. Use 80% of your data as your training set to develop your model, and use the rest of your data as your testing set.

set.seed(1234)
#read CSV file in
default1<- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Test 1/default_payments.csv")

#column names
colnames(default1)
## [1] "LIMIT_BAL" "SEX"       "MARRIAGE"  "AGE"       "PAY_0"     "BILL_AMT1"
## [7] "PAY_AMT1"  "DEFAULT"
#what does the data look like
str(default1)
## 'data.frame':    12000 obs. of  8 variables:
##  $ LIMIT_BAL: int  290000 20000 280000 280000 20000 50000 20000 30000 200000 110000 ...
##  $ SEX      : chr  "F" "M" "M" "F" ...
##  $ MARRIAGE : chr  "Currently Married" "Currently Married" "Currently Married" "Currently Married" ...
##  $ AGE      : int  26 51 29 47 24 26 23 25 38 29 ...
##  $ PAY_0    : int  0 -1 -2 0 0 -2 1 1 -2 0 ...
##  $ BILL_AMT1: int  18125 780 10660 269124 17924 2411 16332 29628 5625 107195 ...
##  $ PAY_AMT1 : int  3000 0 5123 11268 1400 3068 2500 1600 12125 7845 ...
##  $ DEFAULT  : int  0 0 0 0 0 0 1 0 0 0 ...
# 80% sampling of the data
sample_default1 <- sample(nrow(default1), nrow(default1)*0.80)
default1_train <- default1[sample_default1, ]
default1_test <- default1[-sample_default1, ]
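
# quick sanity check on the split: 12000 rows in total, so the 80% sample
# should give 9600 training rows and 2400 testing rows
nrow(default1_train)
nrow(default1_test)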

# library for GAM
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
# GAM model with smoothing splines on all numeric covariates
default1_gam <- gam(DEFAULT ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + (SEX) + (MARRIAGE), data = default1_train)
summary(default1_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + 
##     (SEX) + (MARRIAGE)
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.202645   0.006492  31.213  < 2e-16 ***
## SEXM           0.027993   0.007854   3.564 0.000367 ***
## MARRIAGESingle 0.010227   0.008709   1.174 0.240276    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df       F p-value    
## s(LIMIT_BAL) 8.141  8.694  17.029  <2e-16 ***
## s(AGE)       3.791  4.720   0.786  0.6576    
## s(PAY_0)     8.649  8.929 223.243  <2e-16 ***
## s(BILL_AMT1) 1.000  1.000  29.590  <2e-16 ***
## s(PAY_AMT1)  2.706  3.412   2.978  0.0243 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.205   Deviance explained = 20.7%
## GCV = 0.13618  Scale est. = 0.1358    n = 9600
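
#Side note (an alternative, not the model fitted above): because DEFAULT is
#binary, a logistic GAM is the more conventional choice in mgcv; a minimal
#sketch, with default1_gam_logit as a hypothetical name:
default1_gam_logit <- gam(DEFAULT ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) +
                            s(BILL_AMT1) + s(PAY_AMT1) + SEX + MARRIAGE,
                          family = binomial, data = default1_train)
#predict(default1_gam_logit, type = "response") would then return probabilities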

#a) Are there any variables which appear to not have a statistically significant relationship with the target variable? If so, remove these variables, and create a new GAM that does not include these variables.

set.seed(1234)
# neither MARRIAGE (p = 0.240) nor the smooth of AGE (p = 0.658) has a statistically significant relationship with the target, so both are removed below
default1_gam_model2 <- gam(DEFAULT ~ s(LIMIT_BAL) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + (SEX), data = default1_train)
summary(default1_gam_model2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + 
##     (SEX)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.207319   0.004877  42.511  < 2e-16 ***
## SEXM        0.028122   0.007778   3.616 0.000301 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df       F p-value    
## s(LIMIT_BAL) 8.173  8.713  17.544  <2e-16 ***
## s(PAY_0)     8.664  8.934 224.247  <2e-16 ***
## s(BILL_AMT1) 1.000  1.000  30.982  <2e-16 ***
## s(PAY_AMT1)  2.984  3.744   2.915  0.0225 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.205   Deviance explained = 20.6%
## GCV = 0.13616  Scale est. = 0.13584   n = 9600

#b) Which numeric covariate(s) are related linearly (rather than nonlinearly) to the target?

#BILL_AMT1 is the only numeric covariate related linearly to the target: its smooth term has an edf of exactly 1.000, so the smoothing spline reduces to a straight line
#s(BILL_AMT1) 1.000  1.000  30.982  <2e-16 ***
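
#A quick way to confirm this (an optional check, not asked for): refit model 2
#with BILL_AMT1 entered as a plain linear term; since its edf is already 1.000,
#the two fits should be essentially identical
default1_gam_linear <- gam(DEFAULT ~ s(LIMIT_BAL) + s(PAY_0) + BILL_AMT1 +
                             s(PAY_AMT1) + SEX, data = default1_train)
AIC(default1_gam_model2, default1_gam_linear) #near-identical AICs expected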

#c) Create a collection of plots that display the relationship between each covariate and its smoothing spline.

# plot each covariate's smoothing spline from model 2, all panels on one page

plot(default1_gam_model2, pages = 1)
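
#To zoom in on a single smooth with a shaded confidence band (an optional
#extra using plot.gam's select and shade arguments), e.g. the third smooth,
#s(BILL_AMT1):
plot(default1_gam_model2, select = 3, shade = TRUE)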

#d) Using a cutoff value of 0.5, create a confusion matrix for the data in the testing set.

set.seed(1234)
pcut_default_gam_model2 <- 1/2 #define the 0.5 cut-off
prob_default_gam_model2_out <- predict(default1_gam_model2, default1_test, type = "response")
pred_default_gam_model2_out <- (prob_default_gam_model2_out >= pcut_default_gam_model2)*1
table(default1_test$DEFAULT, pred_default_gam_model2_out, dnn = c("Observed", "Predicted"))
##         Predicted
## Observed    0    1
##        0 1776   90
##        1  370  164
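
#From the matrix above, 90 + 370 = 460 of the 2400 testing observations are
#misclassified at the 0.5 cutoff; the same count computed directly:
sum(default1_test$DEFAULT != pred_default_gam_model2_out)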