flight <- read_excel("C:/Users/justt/Desktop/School/622/Exams/flight_data.xls")
Y <- flight$distance
X <- flight$speed
library(splines)
set.seed(1234)
fbsp <- lm(Y~bs(X, df = 5), data = flight)
summary(fbsp)
##
## Call:
## lm(formula = Y ~ bs(X, df = 5), data = flight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -541.47 -122.79 -5.74 131.70 544.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1211.2 111.7 10.842 < 2e-16 ***
## bs(X, df = 5)1 -365.2 177.9 -2.052 0.0408 *
## bs(X, df = 5)2 -494.2 104.6 -4.722 3.25e-06 ***
## bs(X, df = 5)3 1050.5 140.3 7.486 4.70e-13 ***
## bs(X, df = 5)4 3203.4 139.1 23.024 < 2e-16 ***
## bs(X, df = 5)5 5442.5 179.1 30.388 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194.2 on 394 degrees of freedom
## Multiple R-squared: 0.9637, Adjusted R-squared: 0.9632
## F-statistic: 2090 on 5 and 394 DF, p-value: < 2.2e-16
set.seed(1234)
fbsp_interval <- predict(fbsp, interval = "confidence", level = 0.70)
fbsp_interval
## fit lwr upr
## 1 1211.1619 1095.2248 1327.0990
## 2 1172.4360 1074.0206 1270.8513
## 3 1067.4686 1009.2992 1125.6380
## 4 1061.4940 1005.2173 1117.7706
## 5 1059.4027 1003.7762 1115.0293
## 6 1057.7753 1002.6502 1112.9005
## 7 987.6433 949.7622 1025.5245
## 8 966.1149 931.5559 1000.6740
## 9 965.5131 931.0329 999.9934
## 10 953.4851 920.4292 986.5410
## 11 951.7346 918.8628 984.6063
## 12 946.9374 914.5423 979.3325
## 13 943.9299 911.8137 976.0461
## 14 942.1182 910.1631 974.0734
## 15 937.4581 905.8949 969.0212
## 16 931.9344 900.7992 963.0696
## 17 931.0594 899.9888 962.1299
## 18 929.1353 898.2044 960.0663
## 19 925.8858 895.1835 956.5880
## 20 925.3933 894.7251 956.0616
## 21 923.7406 893.1856 954.2957
## 22 914.4049 884.4782 944.3316
## 23 912.4103 882.6210 942.1995
## 24 911.6770 881.9391 941.4148
## 25 909.1075 879.5549 938.6602
## 26 903.6303 874.5141 932.7464
## 27 902.4947 873.4799 931.5096
## 28 899.5013 870.7805 928.2221
## 29 898.3260 869.7344 926.9176
## 30 897.2415 868.7779 925.7050
## 31 896.3336 867.9851 924.6822
## 32 893.1243 865.2610 920.9875
## 33 891.5240 863.9736 919.0745
## 34 890.1536 862.9380 917.3692
## 35 889.1668 862.2586 916.0751
## 36 888.4741 861.8427 915.1056
## 37 887.2217 861.7741 912.6693
## 38 887.8116 863.2955 912.3276
## 39 888.4849 864.3995 912.5703
## 40 888.6514 864.6536 912.6492
## 41 888.9223 865.0572 912.7875
## 42 889.0497 865.2434 912.8560
## 43 889.1849 865.4389 912.9309
## 44 890.5044 867.2651 913.7436
## 45 891.8909 869.0836 914.6983
## 46 892.4449 869.7915 915.0983
## 47 892.7732 870.2071 915.3393
## 48 893.7960 871.4856 916.1064
## 49 894.7401 872.6473 916.8328
## 50 895.5826 873.6718 917.4935
## 51 896.3135 874.5522 918.0749
## 52 896.3711 874.6212 918.1209
## 53 897.0678 875.4536 918.6819
## 54 897.9052 876.4468 919.3636
## 55 899.0032 877.7381 920.2682
## 56 900.7223 879.7381 921.7064
## 57 901.0127 880.0737 921.9517
## 58 901.7558 880.9294 922.5822
## 59 903.3369 882.7372 923.9365
## 60 903.5747 883.0078 924.1416
## 61 904.8058 884.4028 925.2088
## 62 907.9404 887.9178 927.9630
## 63 909.5084 889.6585 929.3584
## 64 911.6681 892.0388 931.2973
## 65 912.7503 893.2248 932.2757
## 66 913.2253 893.7440 932.7065
## 67 916.0535 896.8192 935.2879
## 68 918.0916 899.0192 937.1640
## 69 920.4475 901.5472 939.3478
## 70 924.2746 905.6226 942.9266
## 71 930.6245 912.3100 948.9391
## 72 930.6734 912.3611 948.9857
## 73 932.8864 914.6723 951.1005
## 74 935.5143 917.4054 953.6232
## 75 936.8921 918.8337 954.9506
## 76 942.3407 924.4529 960.2285
## 77 944.9576 927.1368 962.7783
## 78 946.6150 928.8321 964.3979
## 79 948.4625 930.7178 966.2071
## 80 949.1487 931.4172 966.8802
## 81 951.8652 934.1808 969.5496
## 82 959.9494 942.3613 977.5375
## 83 960.2070 942.6210 977.7930
## 84 964.6725 947.1146 982.2304
## 85 965.8911 948.3383 983.4439
## 86 968.7431 951.1983 986.2879
## 87 969.5261 951.9825 987.0696
## 88 973.1692 955.6267 990.7117
## 89 975.2680 957.7228 992.8133
## 90 982.0737 964.5049 999.6425
## 91 982.8311 965.2584 1000.4038
## 92 983.1030 965.5289 1000.6771
## 93 983.3958 965.8202 1000.9715
## 94 986.4588 968.8647 1004.0529
## 95 991.4619 973.8311 1009.0927
## 96 992.2452 974.6080 1009.8824
## 97 992.6673 975.0266 1010.3080
## 98 992.9007 975.2580 1010.5434
## 99 996.0859 978.4151 1013.7567
## 100 997.5677 979.8830 1015.2523
## 101 998.0999 980.4102 1015.7897
## 102 998.5923 980.8978 1016.2868
## 103 999.9081 982.2007 1017.6156
## 104 1009.5295 991.7197 1027.3393
## 105 1013.8029 995.9445 1031.6613
## 106 1014.5582 996.6912 1032.4253
## 107 1023.0566 1005.0903 1041.0228
## 108 1024.1331 1006.1542 1042.1120
## 109 1024.2983 1006.3175 1042.2792
## 110 1024.6347 1006.6499 1042.6194
## 111 1024.6844 1006.6991 1042.6697
## 112 1025.1276 1007.1371 1043.1181
## 113 1025.2338 1007.2421 1043.2256
## 114 1027.3025 1009.2866 1045.3184
## 115 1030.7718 1012.7157 1048.8279
## 116 1031.9767 1013.9068 1050.0466
## 117 1036.2311 1018.1131 1054.3491
## 118 1036.8782 1018.7530 1055.0035
## 119 1038.3879 1020.2460 1056.5298
## 120 1040.8621 1022.6933 1059.0309
## 121 1041.9993 1023.8183 1060.1803
## 122 1043.5277 1025.3305 1061.7249
## 123 1050.1701 1031.9055 1068.4347
## 124 1055.1206 1036.8093 1073.4318
## 125 1055.4238 1037.1098 1073.7378
## 126 1056.0051 1037.6858 1074.3243
## 127 1069.5108 1051.0843 1087.9372
## 128 1074.8795 1056.4189 1093.3401
## 129 1079.9056 1061.4177 1098.3935
## 130 1082.7127 1064.2116 1101.2138
## 131 1091.1128 1072.5813 1109.6443
## 132 1091.4588 1072.9263 1109.9913
## 133 1103.7872 1085.2364 1122.3379
## 134 1111.7888 1093.2429 1130.3347
## 135 1120.2297 1101.7031 1138.7562
## 136 1127.7918 1109.2944 1146.2893
## 137 1130.8198 1112.3369 1149.3028
## 138 1136.6220 1118.1713 1155.0727
## 139 1137.5323 1119.0872 1155.9774
## 140 1137.7836 1119.3401 1156.2271
## 141 1141.3509 1122.9306 1159.7712
## 142 1141.8914 1123.4748 1160.3080
## 143 1145.0768 1126.6828 1163.4707
## 144 1146.3269 1127.9423 1164.7116
## 145 1149.3677 1131.0067 1167.7288
## 146 1149.6546 1131.2958 1168.0133
## 147 1150.1469 1131.7921 1168.5017
## 148 1151.8565 1133.5157 1170.1972
## 149 1158.1970 1139.9116 1176.4825
## 150 1167.1266 1148.9275 1185.3257
## 151 1174.7724 1156.6543 1192.8905
## 152 1178.6239 1160.5488 1196.6990
## 153 1179.5270 1161.4622 1197.5918
## 154 1183.4649 1165.4457 1201.4840
## 155 1187.1565 1169.1813 1205.1316
## 156 1193.6067 1175.7111 1211.5024
## 157 1195.8015 1177.9335 1213.6695
## 158 1196.8326 1178.9777 1214.6874
## 159 1201.4171 1183.6214 1219.2127
## 160 1202.3021 1184.5180 1220.0861
## 161 1203.3121 1185.5412 1221.0829
## 162 1211.4908 1193.8294 1229.1522
## 163 1213.5565 1195.9232 1231.1898
## 164 1220.4855 1202.9479 1238.0231
## 165 1221.2791 1203.7526 1238.8057
## 166 1222.3089 1204.7968 1239.8211
## 167 1222.4828 1204.9731 1239.9926
## 168 1227.7754 1210.3402 1245.2107
## 169 1233.5254 1216.1719 1250.8789
## 170 1245.2626 1228.0776 1262.4475
## 171 1250.3259 1233.2139 1267.4378
## 172 1252.5546 1235.4749 1269.6344
## 173 1262.7268 1245.7935 1279.6601
## 174 1263.2794 1246.3541 1280.2048
## 175 1263.9813 1247.0661 1280.8966
## 176 1267.4552 1250.5896 1284.3207
## 177 1268.8302 1251.9843 1285.6761
## 178 1275.5823 1258.8323 1292.3324
## 179 1280.9700 1264.2957 1297.6443
## 180 1296.9557 1280.5007 1313.4106
## 181 1297.0129 1280.5587 1313.4671
## 182 1303.4020 1287.0327 1319.7713
## 183 1307.0152 1290.6931 1323.3374
## 184 1312.4072 1296.1543 1328.6601
## 185 1324.1460 1308.0385 1340.2536
## 186 1329.2190 1313.1717 1345.2663
## 187 1339.3750 1323.4436 1355.3064
## 188 1343.9519 1328.0704 1359.8333
## 189 1358.3201 1342.5858 1374.0545
## 190 1358.8500 1343.1208 1374.5792
## 191 1358.9272 1343.1988 1374.6557
## 192 1375.6998 1360.1223 1391.2772
## 193 1376.7390 1361.1701 1392.3079
## 194 1392.9358 1377.4894 1408.3821
## 195 1397.5125 1382.0967 1412.9283
## 196 1410.2689 1394.9287 1425.6091
## 197 1415.1756 1399.8606 1430.4906
## 198 1423.4326 1408.1553 1438.7099
## 199 1426.7138 1411.4498 1441.9778
## 200 1427.2047 1411.9426 1442.4668
## 201 1441.1065 1425.8895 1456.3236
## 202 1442.6841 1427.4711 1457.8971
## 203 1446.9871 1431.7840 1462.1903
## 204 1450.1854 1434.9886 1465.3823
## 205 1454.5901 1439.4004 1469.7798
## 206 1467.1886 1452.0103 1482.3670
## 207 1470.4595 1455.2819 1485.6372
## 208 1473.7050 1458.5272 1488.8828
## 209 1473.7749 1458.5971 1488.9527
## 210 1482.4560 1467.2734 1497.6386
## 211 1487.1618 1471.9740 1502.3496
## 212 1487.6783 1472.4898 1502.8668
## 213 1495.4332 1480.2320 1510.6344
## 214 1498.3004 1483.0933 1513.5076
## 215 1504.0993 1488.8783 1519.3202
## 216 1512.7282 1497.4819 1527.9744
## 217 1514.5746 1499.3223 1529.8270
## 218 1515.3459 1500.0910 1530.6009
## 219 1515.4832 1500.2278 1530.7387
## 220 1516.5968 1501.3375 1531.8561
## 221 1517.1910 1501.9295 1532.4524
## 222 1529.9323 1514.6198 1545.2447
## 223 1538.5240 1523.1710 1553.8769
## 224 1541.6650 1526.2960 1557.0340
## 225 1565.7534 1550.2426 1581.2642
## 226 1566.6224 1551.1059 1582.1389
## 227 1567.8744 1552.3495 1583.3992
## 228 1597.2266 1581.4862 1612.9669
## 229 1600.5025 1584.7358 1616.2692
## 230 1610.2859 1594.4384 1626.1335
## 231 1612.6194 1596.7521 1628.4867
## 232 1616.1131 1600.2159 1632.0103
## 233 1644.7069 1628.5526 1660.8612
## 234 1645.8564 1629.6914 1662.0214
## 235 1658.9868 1642.6978 1675.2757
## 236 1659.5638 1643.2693 1675.8583
## 237 1659.7860 1643.4894 1676.0826
## 238 1663.3240 1646.9936 1679.6545
## 239 1664.2895 1647.9498 1680.6292
## 240 1685.5347 1668.9891 1702.0803
## 241 1689.5712 1672.9862 1706.1562
## 242 1689.9821 1673.3931 1706.5712
## 243 1697.9084 1681.2420 1714.5749
## 244 1707.8040 1691.0410 1724.5670
## 245 1709.6098 1692.8293 1726.3904
## 246 1722.0675 1705.1661 1738.9689
## 247 1724.4944 1707.5696 1741.4191
## 248 1727.1543 1710.2040 1744.1047
## 249 1734.4021 1717.3824 1751.4217
## 250 1745.2907 1728.1682 1762.4133
## 251 1752.1779 1734.9911 1769.3646
## 252 1753.3855 1736.1876 1770.5834
## 253 1757.2831 1740.0494 1774.5169
## 254 1759.1058 1741.8553 1776.3562
## 255 1761.4145 1744.1430 1778.6859
## 256 1770.8932 1753.5364 1788.2499
## 257 1775.2380 1757.8429 1792.6332
## 258 1777.1541 1759.7421 1794.5661
## 259 1796.5967 1779.0192 1814.1743
## 260 1799.7933 1782.1895 1817.3971
## 261 1801.8982 1784.2773 1819.5192
## 262 1805.4925 1787.8426 1823.1424
## 263 1816.5415 1798.8050 1834.2781
## 264 1822.1642 1804.3849 1839.9434
## 265 1823.7654 1805.9742 1841.5566
## 266 1824.5042 1806.7075 1842.3009
## 267 1872.7377 1854.6239 1890.8515
## 268 1879.2950 1861.1448 1897.4452
## 269 1890.9392 1872.7286 1909.1499
## 270 1912.3610 1894.0525 1930.6695
## 271 1916.0770 1897.7533 1934.4007
## 272 1922.6819 1904.3322 1941.0315
## 273 1923.6421 1905.2889 1941.9954
## 274 1937.4639 1919.0619 1955.8659
## 275 1939.0853 1920.6780 1957.4925
## 276 1969.4533 1950.9628 1987.9438
## 277 1969.4707 1950.9802 1987.9612
## 278 1984.5592 1966.0380 2003.0805
## 279 1992.8913 1974.3559 2011.4267
## 280 1996.1953 1977.6549 2014.7358
## 281 2007.4052 1988.8497 2025.9607
## 282 2010.0432 1991.4847 2028.6018
## 283 2014.4842 1995.9209 2033.0476
## 284 2015.7585 1997.1939 2034.3231
## 285 2024.0456 2005.4737 2042.6175
## 286 2042.6167 2024.0343 2061.1992
## 287 2049.5739 2030.9896 2068.1583
## 288 2052.8317 2034.2467 2071.4167
## 289 2065.6891 2047.1038 2084.2743
## 290 2065.8089 2047.2236 2084.3941
## 291 2076.0466 2057.4635 2094.6298
## 292 2082.3731 2063.7921 2100.9541
## 293 2082.5666 2063.9856 2101.1475
## 294 2091.0744 2072.4974 2109.6514
## 295 2094.4717 2075.8966 2113.0467
## 296 2098.8221 2080.2497 2117.3945
## 297 2117.1253 2098.5671 2135.6834
## 298 2119.0599 2100.5035 2137.6163
## 299 2135.0881 2116.5478 2153.6284
## 300 2140.4090 2121.8747 2158.9433
## 301 2149.7599 2131.2367 2168.2832
## 302 2157.2133 2138.6994 2175.7272
## 303 2164.2363 2145.7316 2182.7410
## 304 2175.1980 2156.7082 2193.6878
## 305 2179.8993 2161.4160 2198.3825
## 306 2186.5107 2168.0370 2204.9845
## 307 2187.7111 2169.2391 2206.1832
## 308 2194.9307 2176.4692 2213.3922
## 309 2204.5618 2186.1145 2223.0090
## 310 2208.5977 2190.1565 2227.0390
## 311 2216.6289 2198.1997 2235.0581
## 312 2226.0737 2207.6587 2244.4887
## 313 2231.3707 2212.9636 2249.7777
## 314 2234.1767 2215.7738 2252.5795
## 315 2234.5434 2216.1411 2252.9457
## 316 2249.1168 2230.7360 2267.4976
## 317 2255.6727 2237.3013 2274.0440
## 318 2271.5607 2253.2115 2289.9099
## 319 2281.0034 2262.6667 2299.3400
## 320 2299.1356 2280.8213 2317.4499
## 321 2309.2163 2290.9133 2327.5194
## 322 2315.2371 2296.9404 2333.5338
## 323 2337.8975 2319.6212 2356.1738
## 324 2341.4471 2323.1734 2359.7207
## 325 2347.1534 2328.8837 2365.4230
## 326 2382.8031 2364.5485 2401.0577
## 327 2435.3409 2417.0723 2453.6094
## 328 2457.0773 2438.7885 2475.3661
## 329 2502.4651 2484.1035 2520.8266
## 330 2504.2026 2485.8374 2522.5677
## 331 2505.1221 2486.7550 2523.4892
## 332 2559.6036 2541.0875 2578.1196
## 333 2569.7509 2551.1997 2588.3020
## 334 2584.2249 2565.6196 2602.8302
## 335 2590.8242 2572.1927 2609.4558
## 336 2617.1260 2598.3798 2635.8722
## 337 2639.4510 2620.5951 2658.3070
## 338 2646.5297 2627.6366 2665.4228
## 339 2668.8137 2649.7961 2687.8312
## 340 2701.2908 2682.0723 2720.5094
## 341 2729.4017 2709.9907 2748.8126
## 342 2735.3625 2715.9086 2754.8164
## 343 2736.8153 2717.3508 2756.2797
## 344 2744.8869 2725.3629 2764.4109
## 345 2773.7232 2753.9759 2793.4705
## 346 2809.3931 2789.3476 2829.4387
## 347 2832.8582 2812.6040 2853.1125
## 348 2920.0608 2898.9549 2941.1667
## 349 2939.8471 2918.5334 2961.1608
## 350 2967.8333 2946.2178 2989.4488
## 351 2978.8479 2957.1114 3000.5845
## 352 2998.3729 2976.4188 3020.3270
## 353 3001.1933 2979.2076 3023.1791
## 354 3046.3445 3023.8420 3068.8469
## 355 3048.9207 3026.3883 3071.4531
## 356 3080.1273 3057.2290 3103.0257
## 357 3123.7216 3100.3035 3147.1398
## 358 3129.2648 3105.7799 3152.7496
## 359 3144.1091 3120.4453 3167.7729
## 360 3154.0328 3130.2490 3177.8167
## 361 3159.7841 3135.9306 3183.6376
## 362 3173.3051 3149.2875 3197.3226
## 363 3198.3112 3173.9895 3222.6330
## 364 3222.4982 3197.8818 3247.1147
## 365 3228.1590 3203.4736 3252.8445
## 366 3240.0340 3215.2039 3264.8641
## 367 3252.8637 3227.8773 3277.8500
## 368 3310.2881 3284.6052 3335.9711
## 369 3354.9152 3328.6964 3381.1339
## 370 3405.3885 3378.5725 3432.2044
## 371 3476.3661 3448.7311 3504.0012
## 372 3488.0445 3460.2774 3515.8117
## 373 3494.3783 3466.5398 3522.2167
## 374 3552.9379 3524.4524 3581.4235
## 375 3571.4390 3542.7537 3600.1243
## 376 3651.7959 3622.2710 3681.3207
## 377 3727.4207 3697.1485 3757.6928
## 378 3734.0565 3703.7208 3764.3922
## 379 3736.9189 3706.5559 3767.2820
## 380 3763.2712 3732.6593 3793.8830
## 381 3780.0442 3749.2766 3810.8118
## 382 3971.7498 3939.3327 4004.1669
## 383 3982.9112 3950.4046 4015.4179
## 384 4069.1328 4035.9507 4102.3148
## 385 4173.0696 4139.0981 4207.0412
## 386 4190.2802 4156.1786 4224.3819
## 387 4234.0701 4199.6361 4268.5041
## 388 4260.3673 4225.7319 4295.0027
## 389 4262.6372 4227.9843 4297.2901
## 390 4473.9336 4437.5185 4510.3488
## 391 4565.8056 4528.4798 4603.1313
## 392 4687.4681 4648.7158 4726.2205
## 393 4750.3538 4710.7392 4789.9684
## 394 4969.5808 4926.0990 5013.0626
## 395 4987.6054 4943.7369 5031.4740
## 396 5248.9184 5198.0616 5299.7751
## 397 5253.5425 5202.5373 5304.5476
## 398 5642.3197 5575.5448 5709.0947
## 399 6094.8633 6001.5138 6188.2128
## 400 6653.6284 6515.9397 6791.3172
plot(X, Y, xlab = "Speed", main = "Regression Spline with df = 5", ylab = "Distance", cex = .5)
lines(X, fbsp$fitted.values, col = "blue", lwd = 1)
lines(X, fbsp_interval[,2], col = "red", lty = 2)
lines(X, fbsp_interval[,3], col = "red", lty = 2)
set.seed(1234)
fcsp = lm(Y~ns(X, df = 3), data = flight)
set.seed(1234)
anova(fbsp, fcsp)
## Analysis of Variance Table
##
## Model 1: Y ~ bs(X, df = 5)
## Model 2: Y ~ ns(X, df = 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 394 14851546
## 2 396 15155277 -2 -303730 4.0289 0.01853 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 1, b spline, has the smaller sum of squared errors at 14851546, opposed to Model 2, natural cubic spine, with a value of 15155277.
There is a statistically significant difference between the predictive performance of the two models as the P-value is 0.01853, which is less than 0.05.
The model that is best at predicting distance is the b spline model because the RSS is lower than that of the natural cubic model, and it also has an Adjusted R-squared of 0.9632, 96.32% which is a pretty large indicator that this is good at predicting the distance variable.
set.seed(1234)
health <- read.csv("C:/Users/justt/Desktop/School/622/Exams/health_dataset.csv")
str(health)
## 'data.frame': 2000 obs. of 20 variables:
## $ Gender : chr "female" "female" "female" "female" ...
## $ Age : int 49 45 45 45 66 54 50 16 56 56 ...
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
sample_index <- sample(nrow(health), nrow(health)*0.50)
health_train <- health[sample_index, ]
health_test <- health[-sample_index, ]
# decision tree
set.seed(1234)
dtree_train <- rpart(formula = health_train$Diabetes~., data = health_train, method = "class")
dtree_test_pred <- predict(dtree_train, health_test)
#A table(health_test$Diabetes, dtree_test_pred, dnn = c("Observed", "Predicted"))
# random forest with 100 trees
set.seed(1234)
health_train$Diabetes <- as.factor(health_train$Diabetes)
health_train$Gender <- as.factor(health_train$Gender)
health_train$SleepTrouble <- as.factor(health_train$SleepTrouble )
health_test$Diabetes <- as.factor(health_test$Diabetes)
health_test$Gender <- as.factor(health_test$Gender)
health_test$SleepTrouble <- as.factor(health_test$SleepTrouble )
rf_train <- randomForest(Diabetes ~., data = health_train, importance = TRUE, ntree = 100)
rf_test_pred <- predict(rf_train, health_test)
table(health_test$Diabetes, rf_test_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 910 1
## Yes 66 23
# boosting model
set.seed(1234)
#I could have reloaded the Health data and then converted these records and then re-ran the train and test data set, but I cohose to not to because of the other models. I would have been better off doing this in the beginning. And I would if I had more time.
health_train$Diabetes <- as.factor(health_train$Diabetes)
health_train$Gender <- as.factor(health_train$Gender)
health_train$SleepTrouble <- as.factor(health_train$SleepTrouble )
health_test$Diabetes <- as.factor(health_test$Diabetes)
health_test$Gender <- as.factor(health_test$Gender)
health_test$SleepTrouble <- as.factor(health_test$SleepTrouble )
htboost <- boosting(Diabetes ~., data = health_train, boos = T)
htboost_pred <- predict(htboost, newdata = health_test)
htboost_pred$confusion
## Observed Class
## Predicted Class No Yes
## No 903 61
## Yes 8 28
::: Show all the confusion matrixes for comparison :::
#A table(health_test$Diabetes, dtree_test_pred, dnn = c("Observed", "Predicted"))
table(health_test$Diabetes, rf_test_pred, dnn = c("Observed", "Predicted"))
## Predicted
## Observed No Yes
## No 910 1
## Yes 66 23
htboost_pred$confusion
## Observed Class
## Predicted Class No Yes
## No 903 61
## Yes 8 28
library(class)
health1 <- read.csv("C:/Users/justt/Desktop/School/622/Exams/health_dataset.csv")
str(health1)
## 'data.frame': 2000 obs. of 20 variables:
## $ Gender : chr "female" "female" "female" "female" ...
## $ Age : int 49 45 45 45 66 54 50 16 56 56 ...
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- health[,-1:-2]
str(healthnum)
## 'data.frame': 2000 obs. of 18 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ Pulse : int 86 62 62 62 60 76 74 76 64 64 ...
## $ BPSysAve : int 112 118 118 118 111 134 142 126 95 95 ...
## $ BPDiaAve : int 75 64 64 64 63 85 68 72 69 69 ...
## $ BPSys1 : int 118 106 106 106 124 136 138 132 94 94 ...
## $ BPDia1 : int 82 62 62 62 64 86 66 74 74 74 ...
## $ BPSys2 : int 108 118 118 118 108 132 142 126 94 94 ...
## $ BPDia2 : int 74 68 68 68 62 88 74 68 70 70 ...
## $ BPSys3 : int 116 118 118 118 114 136 142 126 96 96 ...
## $ BPDia3 : int 76 60 60 60 64 82 62 76 68 68 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-4:-12]
str(healthnum)
## 'data.frame': 2000 obs. of 9 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineVol1 : int 77 106 106 106 113 215 64 345 26 26 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-6]
str(healthnum)
## 'data.frame': 2000 obs. of 8 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol : num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineFlow1 : num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ SleepTrouble: chr "Yes" "No" "No" "No" ...
healthnum <- healthnum[,-8]
str(healthnum)
## 'data.frame': 2000 obs. of 7 variables:
## $ Weight : num 86.7 75.7 75.7 75.7 68 74.7 84.1 73.2 57.5 57.5 ...
## $ Height : num 168 167 167 167 170 ...
## $ BMI : num 30.6 27.2 27.2 27.2 23.7 ...
## $ DirectChol: num 1.16 2.12 2.12 2.12 0.67 1.16 1.06 1.14 2.22 2.22 ...
## $ TotChol : num 6.7 5.82 5.82 5.82 4.99 6.41 5.22 3 5.79 5.79 ...
## $ UrineFlow1: num 0.094 1.116 1.116 1.116 0.489 ...
## $ Diabetes : chr "No" "No" "No" "No" ...
sample_index <- sample(nrow(healthnum), nrow(healthnum)*0.50)
healthnum_train <- healthnum[sample_index, ]
healthnum_test <- healthnum[-sample_index, ]
knn_healthnum <- knn(train = healthnum_train[, -7], test = healthnum_test[, -7], cl = healthnum_train[, 7], k = 5)
knn_healthnum
## [1] No No No No No No No No No No No No No No No No No No
## [19] No No No No No No No No No No No No No No No No No No
## [37] No No No No No No No No No No No No No No No No No No
## [55] No No No No No No Yes Yes No No No No No No No No Yes No
## [73] No No No No No No No No No No No No No No No No No No
## [91] No No No No No No No No No No No No No No No No No No
## [109] No No No No No No No No No No No No No No No No No No
## [127] Yes No No No No No No No No No No No No No No No No No
## [145] No No No No No No No No No No No No No No No No No No
## [163] No No No No No No No No No No No No No No No Yes No No
## [181] No No No No No No No No No No No No No No No No Yes No
## [199] No No No No No No No No No No No No No No No No No No
## [217] No No No No Yes Yes No No No No No No No No No No No No
## [235] No No No No No No No No No No No No No Yes No No No No
## [253] No No No No No No No No No No No No No No No No No No
## [271] No No No No No No No No No No No No No No No No No No
## [289] No No No No No No No No No No No Yes No No No No No No
## [307] No No No No No No No No No No No No No No No No No No
## [325] No Yes No No No No No No No No No No No No No No No No
## [343] No No No No No No No No No No No No No No No No No No
## [361] No No No Yes No No No No No No No No No No No No No No
## [379] No No No No No No No No No No No No No No No No No No
## [397] No No No No No No No No No No No No No No No No No No
## [415] No No No No No No No No No No No No No No No No No No
## [433] No No No No No No No No No No No No No No No No No Yes
## [451] No No No No No No No No No No No No No No No No No No
## [469] No No No No No No No No No No No No No Yes No No No No
## [487] No No No No No No No No No No No No No No No No No No
## [505] No No No Yes No No No No No No No No No No No No No No
## [523] No No No No No No No No No No No No No No No No No No
## [541] No No No No No No No No No No No No Yes No No No No No
## [559] No No No No No No No No No No No No No No No No No No
## [577] No No No No No No No No No No No No No No No No No No
## [595] No No No No No No No No No No No No No No No No No No
## [613] No No No No No No No No No No No No No No No No No No
## [631] No No No No No No No No No No No No No No No No No No
## [649] No No No No No No No Yes No No No No No No Yes No No No
## [667] No No No No No No No No No No No No No No No No No No
## [685] No No No No No No No No No No No No No No No No No No
## [703] No No No No No No No No No No No No No No No No No No
## [721] No No No No No No No No No No No No No No No No No No
## [739] No No No No No No No No No No No No No No No No No No
## [757] No No No No No No No No No No No No No No No No No No
## [775] No No No No No No No No No No No No No No No No No No
## [793] No No Yes No No No No No No No No No No No No No No No
## [811] No No No No No No No No No No No No No No No No No No
## [829] No No No No No No No No No No No No No No No No No No
## [847] No No No No No No No No No No No No No No No No No No
## [865] No No No No No No Yes No No No No No No No Yes No No No
## [883] No No No No No No No No No No No No No No No No No No
## [901] No No No No No No No No No No No No Yes Yes Yes No No No
## [919] No No No No No No No No No No No No No No No No No No
## [937] No No No No No No No No No No No No No No No No No No
## [955] No No No No No No No No No No No No No No No No No No
## [973] No No No No No No No No Yes No No No No No No No No No
## [991] No No No No No No No No No No
## Levels: No Yes
table(healthnum_test[,7], knn_healthnum, dnn = c("True", "Predicted"))
## Predicted
## True No Yes
## No 903 18
## Yes 72 7
sum(healthnum_test[, 7] != knn_healthnum) #number misclassified
## [1] 90
::: 4. Develop a generalized additive model (GAM) to predict whether or not someone will default on a payment using the default_payments csv file. Use the “DEFAULT” column as your target, and all other variables as your covariates. Use smoothing splines for all numeric covariates. Use 80% of your data as your training set to develop your model, and use the rest of your data as your testing set. (Note that the default_payments csv file contains different data than that in the credit_default csv file that was provided for practice in our Google Drive.) :::
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
set.seed(1234)
payments <- read.csv("C:/Users/justt/Desktop/School/622/Exams/default_payments.csv")
payments$LIMIT_BAL <- as.numeric(payments$LIMIT_BAL)
payments$PAY_0 <- as.numeric(payments$PAY_0)
payments$BILL_AMT1 <- as.numeric(payments$BILL_AMT1)
payments$PAY_AMT1 <- as.numeric(payments$PAY_AMT1)
sample_index <- sample(nrow(payments), nrow(payments)*0.80)
payments_train <- payments[sample_index,]
payments_test <- payments[-sample_index,]
str(payments)
## 'data.frame': 12000 obs. of 8 variables:
## $ LIMIT_BAL: num 290000 20000 280000 280000 20000 50000 20000 30000 200000 110000 ...
## $ SEX : chr "F" "M" "M" "F" ...
## $ MARRIAGE : chr "Currently Married" "Currently Married" "Currently Married" "Currently Married" ...
## $ AGE : int 26 51 29 47 24 26 23 25 38 29 ...
## $ PAY_0 : num 0 -1 -2 0 0 -2 1 1 -2 0 ...
## $ BILL_AMT1: num 18125 780 10660 269124 17924 ...
## $ PAY_AMT1 : num 3000 0 5123 11268 1400 ...
## $ DEFAULT : int 0 0 0 0 0 0 1 0 0 0 ...
payments_gam <- gam(DEFAULT ~ s(LIMIT_BAL) + SEX + MARRIAGE + AGE + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) ,data =payments_train )
summary(payments_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + MARRIAGE + AGE + s(PAY_0) + s(BILL_AMT1) +
## s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1929612 0.0156241 12.350 < 2e-16 ***
## SEXM 0.0279813 0.0078294 3.574 0.000353 ***
## MARRIAGESingle 0.0116421 0.0085311 1.365 0.172392
## AGE 0.0002544 0.0004656 0.546 0.584833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.104 8.671 18.009 <2e-16 ***
## s(PAY_0) 8.656 8.931 223.283 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 29.393 <2e-16 ***
## s(PAY_AMT1) 2.713 3.420 2.928 0.026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.7%
## GCV = 0.13617 Scale est. = 0.13582 n = 9600
#Marriage and Age are not statistically significant as their P-Values are greater than .05
set.seed(1234)
payments_gam1 <- gam(DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) ,data =payments_train )
summary(payments_gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.207319 0.004877 42.511 < 2e-16 ***
## SEXM 0.028122 0.007778 3.616 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.173 8.713 17.544 <2e-16 ***
## s(PAY_0) 8.664 8.934 224.247 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 30.982 <2e-16 ***
## s(PAY_AMT1) 2.984 3.744 2.915 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.6%
## GCV = 0.13616 Scale est. = 0.13584 n = 9600
Marriage and Age are not statistically significant as their P-Values are greater that .05. After the new GAM model is ran removing these variable, all remaining variables are statistically significant.
set.seed(1234)
summary(payments_gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + SEX + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.207319 0.004877 42.511 < 2e-16 ***
## SEXM 0.028122 0.007778 3.616 0.000301 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 8.173 8.713 17.544 <2e-16 ***
## s(PAY_0) 8.664 8.934 224.247 <2e-16 ***
## s(BILL_AMT1) 1.000 1.000 30.982 <2e-16 ***
## s(PAY_AMT1) 2.984 3.744 2.915 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.205 Deviance explained = 20.6%
## GCV = 0.13616 Scale est. = 0.13584 n = 9600
The only numeric value that is related linearly to DEFAULT is BILL_AMT1.
set.seed(1234)
plot(payments_gam1, pages = 1)
set.seed(1234)
pcut_gam <- 1/2
prob_payments_gam1 <- predict(payments_gam1, payments_test, type = "response")
pred_payments_gam1 <- (prob_payments_gam1 >= pcut_gam)*1
table(payments_test$DEFAULT, pred_payments_gam1, dnn = c("Observed", "Predicted"))
## Predicted
## Observed 0 1
## 0 1776 90
## 1 370 164