library(readxl)
flightdata <- read_excel("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Exam 1/flight_data.xls")
X <- flightdata$speed
Y <- flightdata$distance
library(splines)
flight_sp <- lm(Y~bs(X, df = 5), data = flightdata)
summary(flight_sp)
##
## Call:
## lm(formula = Y ~ bs(X, df = 5), data = flightdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -541.47 -122.79 -5.74 131.70 544.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1211.2 111.7 10.842 < 2e-16 ***
## bs(X, df = 5)1 -365.2 177.9 -2.052 0.0408 *
## bs(X, df = 5)2 -494.2 104.6 -4.722 3.25e-06 ***
## bs(X, df = 5)3 1050.5 140.3 7.486 4.70e-13 ***
## bs(X, df = 5)4 3203.4 139.1 23.024 < 2e-16 ***
## bs(X, df = 5)5 5442.5 179.1 30.388 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194.2 on 394 degrees of freedom
## Multiple R-squared: 0.9637, Adjusted R-squared: 0.9632
## F-statistic: 2090 on 5 and 394 DF, p-value: < 2.2e-16
conf_interval <- predict(flight_sp, interval = "confidence", level = 0.70)
conf_interval
## fit lwr upr
## 1 1211.1619 1095.2248 1327.0990
## 2 1172.4360 1074.0206 1270.8513
## 3 1067.4686 1009.2992 1125.6380
## 4 1061.4940 1005.2173 1117.7706
## 5 1059.4027 1003.7762 1115.0293
## 6 1057.7753 1002.6502 1112.9005
## 7 987.6433 949.7622 1025.5245
## 8 966.1149 931.5559 1000.6740
## 9 965.5131 931.0329 999.9934
## 10 953.4851 920.4292 986.5410
## 11 951.7346 918.8628 984.6063
## 12 946.9374 914.5423 979.3325
## 13 943.9299 911.8137 976.0461
## 14 942.1182 910.1631 974.0734
## 15 937.4581 905.8949 969.0212
## 16 931.9344 900.7992 963.0696
## 17 931.0594 899.9888 962.1299
## 18 929.1353 898.2044 960.0663
## 19 925.8858 895.1835 956.5880
## 20 925.3933 894.7251 956.0616
## 21 923.7406 893.1856 954.2957
## 22 914.4049 884.4782 944.3316
## 23 912.4103 882.6210 942.1995
## 24 911.6770 881.9391 941.4148
## 25 909.1075 879.5549 938.6602
## 26 903.6303 874.5141 932.7464
## 27 902.4947 873.4799 931.5096
## 28 899.5013 870.7805 928.2221
## 29 898.3260 869.7344 926.9176
## 30 897.2415 868.7779 925.7050
## 31 896.3336 867.9851 924.6822
## 32 893.1243 865.2610 920.9875
## 33 891.5240 863.9736 919.0745
## 34 890.1536 862.9380 917.3692
## 35 889.1668 862.2586 916.0751
## 36 888.4741 861.8427 915.1056
## 37 887.2217 861.7741 912.6693
## 38 887.8116 863.2955 912.3276
## 39 888.4849 864.3995 912.5703
## 40 888.6514 864.6536 912.6492
## 41 888.9223 865.0572 912.7875
## 42 889.0497 865.2434 912.8560
## 43 889.1849 865.4389 912.9309
## 44 890.5044 867.2651 913.7436
## 45 891.8909 869.0836 914.6983
## 46 892.4449 869.7915 915.0983
## 47 892.7732 870.2071 915.3393
## 48 893.7960 871.4856 916.1064
## 49 894.7401 872.6473 916.8328
## 50 895.5826 873.6718 917.4935
## 51 896.3135 874.5522 918.0749
## 52 896.3711 874.6212 918.1209
## 53 897.0678 875.4536 918.6819
## 54 897.9052 876.4468 919.3636
## 55 899.0032 877.7381 920.2682
## 56 900.7223 879.7381 921.7064
## 57 901.0127 880.0737 921.9517
## 58 901.7558 880.9294 922.5822
## 59 903.3369 882.7372 923.9365
## 60 903.5747 883.0078 924.1416
## 61 904.8058 884.4028 925.2088
## 62 907.9404 887.9178 927.9630
## 63 909.5084 889.6585 929.3584
## 64 911.6681 892.0388 931.2973
## 65 912.7503 893.2248 932.2757
## 66 913.2253 893.7440 932.7065
## 67 916.0535 896.8192 935.2879
## 68 918.0916 899.0192 937.1640
## 69 920.4475 901.5472 939.3478
## 70 924.2746 905.6226 942.9266
## 71 930.6245 912.3100 948.9391
## 72 930.6734 912.3611 948.9857
## 73 932.8864 914.6723 951.1005
## 74 935.5143 917.4054 953.6232
## 75 936.8921 918.8337 954.9506
## 76 942.3407 924.4529 960.2285
## 77 944.9576 927.1368 962.7783
## 78 946.6150 928.8321 964.3979
## 79 948.4625 930.7178 966.2071
## 80 949.1487 931.4172 966.8802
## 81 951.8652 934.1808 969.5496
## 82 959.9494 942.3613 977.5375
## 83 960.2070 942.6210 977.7930
## 84 964.6725 947.1146 982.2304
## 85 965.8911 948.3383 983.4439
## 86 968.7431 951.1983 986.2879
## 87 969.5261 951.9825 987.0696
## 88 973.1692 955.6267 990.7117
## 89 975.2680 957.7228 992.8133
## 90 982.0737 964.5049 999.6425
## 91 982.8311 965.2584 1000.4038
## 92 983.1030 965.5289 1000.6771
## 93 983.3958 965.8202 1000.9715
## 94 986.4588 968.8647 1004.0529
## 95 991.4619 973.8311 1009.0927
## 96 992.2452 974.6080 1009.8824
## 97 992.6673 975.0266 1010.3080
## 98 992.9007 975.2580 1010.5434
## 99 996.0859 978.4151 1013.7567
## 100 997.5677 979.8830 1015.2523
## 101 998.0999 980.4102 1015.7897
## 102 998.5923 980.8978 1016.2868
## 103 999.9081 982.2007 1017.6156
## 104 1009.5295 991.7197 1027.3393
## 105 1013.8029 995.9445 1031.6613
## 106 1014.5582 996.6912 1032.4253
## 107 1023.0566 1005.0903 1041.0228
## 108 1024.1331 1006.1542 1042.1120
## 109 1024.2983 1006.3175 1042.2792
## 110 1024.6347 1006.6499 1042.6194
## 111 1024.6844 1006.6991 1042.6697
## 112 1025.1276 1007.1371 1043.1181
## 113 1025.2338 1007.2421 1043.2256
## 114 1027.3025 1009.2866 1045.3184
## 115 1030.7718 1012.7157 1048.8279
## 116 1031.9767 1013.9068 1050.0466
## 117 1036.2311 1018.1131 1054.3491
## 118 1036.8782 1018.7530 1055.0035
## 119 1038.3879 1020.2460 1056.5298
## 120 1040.8621 1022.6933 1059.0309
## 121 1041.9993 1023.8183 1060.1803
## 122 1043.5277 1025.3305 1061.7249
## 123 1050.1701 1031.9055 1068.4347
## 124 1055.1206 1036.8093 1073.4318
## 125 1055.4238 1037.1098 1073.7378
## 126 1056.0051 1037.6858 1074.3243
## 127 1069.5108 1051.0843 1087.9372
## 128 1074.8795 1056.4189 1093.3401
## 129 1079.9056 1061.4177 1098.3935
## 130 1082.7127 1064.2116 1101.2138
## 131 1091.1128 1072.5813 1109.6443
## 132 1091.4588 1072.9263 1109.9913
## 133 1103.7872 1085.2364 1122.3379
## 134 1111.7888 1093.2429 1130.3347
## 135 1120.2297 1101.7031 1138.7562
## 136 1127.7918 1109.2944 1146.2893
## 137 1130.8198 1112.3369 1149.3028
## 138 1136.6220 1118.1713 1155.0727
## 139 1137.5323 1119.0872 1155.9774
## 140 1137.7836 1119.3401 1156.2271
## 141 1141.3509 1122.9306 1159.7712
## 142 1141.8914 1123.4748 1160.3080
## 143 1145.0768 1126.6828 1163.4707
## 144 1146.3269 1127.9423 1164.7116
## 145 1149.3677 1131.0067 1167.7288
## 146 1149.6546 1131.2958 1168.0133
## 147 1150.1469 1131.7921 1168.5017
## 148 1151.8565 1133.5157 1170.1972
## 149 1158.1970 1139.9116 1176.4825
## 150 1167.1266 1148.9275 1185.3257
## 151 1174.7724 1156.6543 1192.8905
## 152 1178.6239 1160.5488 1196.6990
## 153 1179.5270 1161.4622 1197.5918
## 154 1183.4649 1165.4457 1201.4840
## 155 1187.1565 1169.1813 1205.1316
## 156 1193.6067 1175.7111 1211.5024
## 157 1195.8015 1177.9335 1213.6695
## 158 1196.8326 1178.9777 1214.6874
## 159 1201.4171 1183.6214 1219.2127
## 160 1202.3021 1184.5180 1220.0861
## 161 1203.3121 1185.5412 1221.0829
## 162 1211.4908 1193.8294 1229.1522
## 163 1213.5565 1195.9232 1231.1898
## 164 1220.4855 1202.9479 1238.0231
## 165 1221.2791 1203.7526 1238.8057
## 166 1222.3089 1204.7968 1239.8211
## 167 1222.4828 1204.9731 1239.9926
## 168 1227.7754 1210.3402 1245.2107
## 169 1233.5254 1216.1719 1250.8789
## 170 1245.2626 1228.0776 1262.4475
## 171 1250.3259 1233.2139 1267.4378
## 172 1252.5546 1235.4749 1269.6344
## 173 1262.7268 1245.7935 1279.6601
## 174 1263.2794 1246.3541 1280.2048
## 175 1263.9813 1247.0661 1280.8966
## 176 1267.4552 1250.5896 1284.3207
## 177 1268.8302 1251.9843 1285.6761
## 178 1275.5823 1258.8323 1292.3324
## 179 1280.9700 1264.2957 1297.6443
## 180 1296.9557 1280.5007 1313.4106
## 181 1297.0129 1280.5587 1313.4671
## 182 1303.4020 1287.0327 1319.7713
## 183 1307.0152 1290.6931 1323.3374
## 184 1312.4072 1296.1543 1328.6601
## 185 1324.1460 1308.0385 1340.2536
## 186 1329.2190 1313.1717 1345.2663
## 187 1339.3750 1323.4436 1355.3064
## 188 1343.9519 1328.0704 1359.8333
## 189 1358.3201 1342.5858 1374.0545
## 190 1358.8500 1343.1208 1374.5792
## 191 1358.9272 1343.1988 1374.6557
## 192 1375.6998 1360.1223 1391.2772
## 193 1376.7390 1361.1701 1392.3079
## 194 1392.9358 1377.4894 1408.3821
## 195 1397.5125 1382.0967 1412.9283
## 196 1410.2689 1394.9287 1425.6091
## 197 1415.1756 1399.8606 1430.4906
## 198 1423.4326 1408.1553 1438.7099
## 199 1426.7138 1411.4498 1441.9778
## 200 1427.2047 1411.9426 1442.4668
## 201 1441.1065 1425.8895 1456.3236
## 202 1442.6841 1427.4711 1457.8971
## 203 1446.9871 1431.7840 1462.1903
## 204 1450.1854 1434.9886 1465.3823
## 205 1454.5901 1439.4004 1469.7798
## 206 1467.1886 1452.0103 1482.3670
## 207 1470.4595 1455.2819 1485.6372
## 208 1473.7050 1458.5272 1488.8828
## 209 1473.7749 1458.5971 1488.9527
## 210 1482.4560 1467.2734 1497.6386
## 211 1487.1618 1471.9740 1502.3496
## 212 1487.6783 1472.4898 1502.8668
## 213 1495.4332 1480.2320 1510.6344
## 214 1498.3004 1483.0933 1513.5076
## 215 1504.0993 1488.8783 1519.3202
## 216 1512.7282 1497.4819 1527.9744
## 217 1514.5746 1499.3223 1529.8270
## 218 1515.3459 1500.0910 1530.6009
## 219 1515.4832 1500.2278 1530.7387
## 220 1516.5968 1501.3375 1531.8561
## 221 1517.1910 1501.9295 1532.4524
## 222 1529.9323 1514.6198 1545.2447
## 223 1538.5240 1523.1710 1553.8769
## 224 1541.6650 1526.2960 1557.0340
## 225 1565.7534 1550.2426 1581.2642
## 226 1566.6224 1551.1059 1582.1389
## 227 1567.8744 1552.3495 1583.3992
## 228 1597.2266 1581.4862 1612.9669
## 229 1600.5025 1584.7358 1616.2692
## 230 1610.2859 1594.4384 1626.1335
## 231 1612.6194 1596.7521 1628.4867
## 232 1616.1131 1600.2159 1632.0103
## 233 1644.7069 1628.5526 1660.8612
## 234 1645.8564 1629.6914 1662.0214
## 235 1658.9868 1642.6978 1675.2757
## 236 1659.5638 1643.2693 1675.8583
## 237 1659.7860 1643.4894 1676.0826
## 238 1663.3240 1646.9936 1679.6545
## 239 1664.2895 1647.9498 1680.6292
## 240 1685.5347 1668.9891 1702.0803
## 241 1689.5712 1672.9862 1706.1562
## 242 1689.9821 1673.3931 1706.5712
## 243 1697.9084 1681.2420 1714.5749
## 244 1707.8040 1691.0410 1724.5670
## 245 1709.6098 1692.8293 1726.3904
## 246 1722.0675 1705.1661 1738.9689
## 247 1724.4944 1707.5696 1741.4191
## 248 1727.1543 1710.2040 1744.1047
## 249 1734.4021 1717.3824 1751.4217
## 250 1745.2907 1728.1682 1762.4133
## 251 1752.1779 1734.9911 1769.3646
## 252 1753.3855 1736.1876 1770.5834
## 253 1757.2831 1740.0494 1774.5169
## 254 1759.1058 1741.8553 1776.3562
## 255 1761.4145 1744.1430 1778.6859
## 256 1770.8932 1753.5364 1788.2499
## 257 1775.2380 1757.8429 1792.6332
## 258 1777.1541 1759.7421 1794.5661
## 259 1796.5967 1779.0192 1814.1743
## 260 1799.7933 1782.1895 1817.3971
## 261 1801.8982 1784.2773 1819.5192
## 262 1805.4925 1787.8426 1823.1424
## 263 1816.5415 1798.8050 1834.2781
## 264 1822.1642 1804.3849 1839.9434
## 265 1823.7654 1805.9742 1841.5566
## 266 1824.5042 1806.7075 1842.3009
## 267 1872.7377 1854.6239 1890.8515
## 268 1879.2950 1861.1448 1897.4452
## 269 1890.9392 1872.7286 1909.1499
## 270 1912.3610 1894.0525 1930.6695
## 271 1916.0770 1897.7533 1934.4007
## 272 1922.6819 1904.3322 1941.0315
## 273 1923.6421 1905.2889 1941.9954
## 274 1937.4639 1919.0619 1955.8659
## 275 1939.0853 1920.6780 1957.4925
## 276 1969.4533 1950.9628 1987.9438
## 277 1969.4707 1950.9802 1987.9612
## 278 1984.5592 1966.0380 2003.0805
## 279 1992.8913 1974.3559 2011.4267
## 280 1996.1953 1977.6549 2014.7358
## 281 2007.4052 1988.8497 2025.9607
## 282 2010.0432 1991.4847 2028.6018
## 283 2014.4842 1995.9209 2033.0476
## 284 2015.7585 1997.1939 2034.3231
## 285 2024.0456 2005.4737 2042.6175
## 286 2042.6167 2024.0343 2061.1992
## 287 2049.5739 2030.9896 2068.1583
## 288 2052.8317 2034.2467 2071.4167
## 289 2065.6891 2047.1038 2084.2743
## 290 2065.8089 2047.2236 2084.3941
## 291 2076.0466 2057.4635 2094.6298
## 292 2082.3731 2063.7921 2100.9541
## 293 2082.5666 2063.9856 2101.1475
## 294 2091.0744 2072.4974 2109.6514
## 295 2094.4717 2075.8966 2113.0467
## 296 2098.8221 2080.2497 2117.3945
## 297 2117.1253 2098.5671 2135.6834
## 298 2119.0599 2100.5035 2137.6163
## 299 2135.0881 2116.5478 2153.6284
## 300 2140.4090 2121.8747 2158.9433
## 301 2149.7599 2131.2367 2168.2832
## 302 2157.2133 2138.6994 2175.7272
## 303 2164.2363 2145.7316 2182.7410
## 304 2175.1980 2156.7082 2193.6878
## 305 2179.8993 2161.4160 2198.3825
## 306 2186.5107 2168.0370 2204.9845
## 307 2187.7111 2169.2391 2206.1832
## 308 2194.9307 2176.4692 2213.3922
## 309 2204.5618 2186.1145 2223.0090
## 310 2208.5977 2190.1565 2227.0390
## 311 2216.6289 2198.1997 2235.0581
## 312 2226.0737 2207.6587 2244.4887
## 313 2231.3707 2212.9636 2249.7777
## 314 2234.1767 2215.7738 2252.5795
## 315 2234.5434 2216.1411 2252.9457
## 316 2249.1168 2230.7360 2267.4976
## 317 2255.6727 2237.3013 2274.0440
## 318 2271.5607 2253.2115 2289.9099
## 319 2281.0034 2262.6667 2299.3400
## 320 2299.1356 2280.8213 2317.4499
## 321 2309.2163 2290.9133 2327.5194
## 322 2315.2371 2296.9404 2333.5338
## 323 2337.8975 2319.6212 2356.1738
## 324 2341.4471 2323.1734 2359.7207
## 325 2347.1534 2328.8837 2365.4230
## 326 2382.8031 2364.5485 2401.0577
## 327 2435.3409 2417.0723 2453.6094
## 328 2457.0773 2438.7885 2475.3661
## 329 2502.4651 2484.1035 2520.8266
## 330 2504.2026 2485.8374 2522.5677
## 331 2505.1221 2486.7550 2523.4892
## 332 2559.6036 2541.0875 2578.1196
## 333 2569.7509 2551.1997 2588.3020
## 334 2584.2249 2565.6196 2602.8302
## 335 2590.8242 2572.1927 2609.4558
## 336 2617.1260 2598.3798 2635.8722
## 337 2639.4510 2620.5951 2658.3070
## 338 2646.5297 2627.6366 2665.4228
## 339 2668.8137 2649.7961 2687.8312
## 340 2701.2908 2682.0723 2720.5094
## 341 2729.4017 2709.9907 2748.8126
## 342 2735.3625 2715.9086 2754.8164
## 343 2736.8153 2717.3508 2756.2797
## 344 2744.8869 2725.3629 2764.4109
## 345 2773.7232 2753.9759 2793.4705
## 346 2809.3931 2789.3476 2829.4387
## 347 2832.8582 2812.6040 2853.1125
## 348 2920.0608 2898.9549 2941.1667
## 349 2939.8471 2918.5334 2961.1608
## 350 2967.8333 2946.2178 2989.4488
## 351 2978.8479 2957.1114 3000.5845
## 352 2998.3729 2976.4188 3020.3270
## 353 3001.1933 2979.2076 3023.1791
## 354 3046.3445 3023.8420 3068.8469
## 355 3048.9207 3026.3883 3071.4531
## 356 3080.1273 3057.2290 3103.0257
## 357 3123.7216 3100.3035 3147.1398
## 358 3129.2648 3105.7799 3152.7496
## 359 3144.1091 3120.4453 3167.7729
## 360 3154.0328 3130.2490 3177.8167
## 361 3159.7841 3135.9306 3183.6376
## 362 3173.3051 3149.2875 3197.3226
## 363 3198.3112 3173.9895 3222.6330
## 364 3222.4982 3197.8818 3247.1147
## 365 3228.1590 3203.4736 3252.8445
## 366 3240.0340 3215.2039 3264.8641
## 367 3252.8637 3227.8773 3277.8500
## 368 3310.2881 3284.6052 3335.9711
## 369 3354.9152 3328.6964 3381.1339
## 370 3405.3885 3378.5725 3432.2044
## 371 3476.3661 3448.7311 3504.0012
## 372 3488.0445 3460.2774 3515.8117
## 373 3494.3783 3466.5398 3522.2167
## 374 3552.9379 3524.4524 3581.4235
## 375 3571.4390 3542.7537 3600.1243
## 376 3651.7959 3622.2710 3681.3207
## 377 3727.4207 3697.1485 3757.6928
## 378 3734.0565 3703.7208 3764.3922
## 379 3736.9189 3706.5559 3767.2820
## 380 3763.2712 3732.6593 3793.8830
## 381 3780.0442 3749.2766 3810.8118
## 382 3971.7498 3939.3327 4004.1669
## 383 3982.9112 3950.4046 4015.4179
## 384 4069.1328 4035.9507 4102.3148
## 385 4173.0696 4139.0981 4207.0412
## 386 4190.2802 4156.1786 4224.3819
## 387 4234.0701 4199.6361 4268.5041
## 388 4260.3673 4225.7319 4295.0027
## 389 4262.6372 4227.9843 4297.2901
## 390 4473.9336 4437.5185 4510.3488
## 391 4565.8056 4528.4798 4603.1313
## 392 4687.4681 4648.7158 4726.2205
## 393 4750.3538 4710.7392 4789.9684
## 394 4969.5808 4926.0990 5013.0626
## 395 4987.6054 4943.7369 5031.4740
## 396 5248.9184 5198.0616 5299.7751
## 397 5253.5425 5202.5373 5304.5476
## 398 5642.3197 5575.5448 5709.0947
## 399 6094.8633 6001.5138 6188.2128
## 400 6653.6284 6515.9397 6791.3172
plot(X, Y, xlab = "speed", main = "Regression Spline", ylab = "distance", cex = .5)
lines(X, flight_sp$fitted.values, col = "blue", lwd = 1)
lines(X, conf_interval[,2], col = "red", lty = 2)
lines(X, conf_interval[,3], col = "red", lty = 2)
fit = lm(Y~ns(X, df = 3), data = flightdata)
summary(fit)
##
## Call:
## lm(formula = Y ~ ns(X, df = 3), data = flightdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -564.04 -121.52 -3.54 130.88 541.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1069.81 59.34 18.03 <2e-16 ***
## ns(X, df = 3)1 1266.40 44.35 28.55 <2e-16 ***
## ns(X, df = 3)2 3236.23 137.77 23.49 <2e-16 ***
## ns(X, df = 3)3 5494.25 75.91 72.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 195.6 on 396 degrees of freedom
## Multiple R-squared: 0.9629, Adjusted R-squared: 0.9626
## F-statistic: 3429 on 3 and 396 DF, p-value: < 2.2e-16
anova(fit, flight_sp)
## Analysis of Variance Table
##
## Model 1: Y ~ ns(X, df = 3)
## Model 2: Y ~ bs(X, df = 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 396 15155277
## 2 394 14851546 2 303730 4.0289 0.01853 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the comparison above, the spline model with 5 degrees of freedom has a lower sum of squared error of 14851546. As the p-value is less than 0.05, there is statistically significant differences between the predictive performance of both models. The spline model with 5 degrees of freedom is the better model to use for predicting landing distance as it has a higher adjusted r-squared error and lower residual standard error compared to the spline model with 3 degrees of freedom.
library(readr)
healthdata <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Exam 1/health_dataset.csv")
## Rows: 2000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Gender, Diabetes, SleepTrouble
## dbl (17): Age, Weight, Height, BMI, Pulse, BPSysAve, BPDiaAve, BPSys1, BPDia...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
healthdata$Diabetes <- as.factor(healthdata$Diabetes)
healthdata$Gender <- as.factor(healthdata$Gender)
healthdata$SleepTrouble <- as.factor(healthdata$SleepTrouble)
index <- sample(nrow(healthdata), nrow(healthdata)*0.50)
health_train <- healthdata[index,]
health_test <- healthdata[-index,]
#Decision Tree
library(rpart)
library(rpart.plot)
reg_tree <- rpart(formula = Diabetes ~ ., data = healthdata)
prp(reg_tree, digits = 4, extra = 1)
reg_tree_pred <- predict(reg_tree, health_test, type="class")
table(health_test$Diabetes, reg_tree_pred, dnn = c("True", "Predicted"))
## Predicted
## True No Yes
## No 917 2
## Yes 45 36
#Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
rf_model <- randomForest(Diabetes ~., data = health_train, importance = TRUE, ntree = 100)
rf_model
##
## Call:
## randomForest(formula = Diabetes ~ ., data = health_train, importance = TRUE, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 6.4%
## Confusion matrix:
## No Yes class.error
## No 905 8 0.008762322
## Yes 56 31 0.643678161
rf_pred <- predict(rf_model, health_test, type="class")
table(health_test$Diabetes, rf_pred, dnn = c("True", "Predicted"))
## Predicted
## True No Yes
## No 915 4
## Yes 63 18
# Boosting Model
# library(adabag)
# health_boost = boosting(Diabetes~., data = health_train, boos = T)
# boost_pred <- predict(health_boost, health_test)
# boost_pred$confusion
My boosting model would not run. I am not sure why. I reinstalled all the packages, but it still would not run. It just gets stuck running endlessly.
library(class)
healthdata2 <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Exam 1/health_dataset.csv")
## Rows: 2000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Gender, Diabetes, SleepTrouble
## dbl (17): Age, Weight, Height, BMI, Pulse, BPSysAve, BPDiaAve, BPSys1, BPDia...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
index2 <- sample(nrow(healthdata2), nrow(healthdata2)*0.50)
health_train2 <- healthdata2[index2,]
health_test2 <- healthdata2[-index2,]
length(health_train2[,2:18])
## [1] 17
length(health_test2[,2:18])
## [1] 17
length(health_train2[,19])
## [1] 1
# knn_health <- knn(train = health_train2[,2:18], test = health_test2[,2:18], cl = health_train2[, 19], k = 5)
# knn_health
# table(health_test2[, 19], knn_health, dnn = c("True", "Predicted"))
# sum(health_test2[, 19] != knn_health)
My KNN model isn’t working either. Error saying train and class have different lengths. But I checked the lengths of the training and testing set and they are the same. I even tried to work with the data set without converting chr to factors. Still gave me the error. I’m not sure what is going on.
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
defaultpayments <- read_csv("C:/Users/Lynx/Documents/MSDA/MSDA 622 - Big Data/Exam 1/default_payments.csv")
## Rows: 12000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): SEX, MARRIAGE
## dbl (6): LIMIT_BAL, AGE, PAY_0, BILL_AMT1, PAY_AMT1, DEFAULT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(defaultpayments)
## spc_tbl_ [12,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ LIMIT_BAL: num [1:12000] 290000 20000 280000 280000 20000 50000 20000 30000 200000 110000 ...
## $ SEX : chr [1:12000] "F" "M" "M" "F" ...
## $ MARRIAGE : chr [1:12000] "Currently Married" "Currently Married" "Currently Married" "Currently Married" ...
## $ AGE : num [1:12000] 26 51 29 47 24 26 23 25 38 29 ...
## $ PAY_0 : num [1:12000] 0 -1 -2 0 0 -2 1 1 -2 0 ...
## $ BILL_AMT1: num [1:12000] 18125 780 10660 269124 17924 ...
## $ PAY_AMT1 : num [1:12000] 3000 0 5123 11268 1400 ...
## $ DEFAULT : num [1:12000] 0 0 0 0 0 0 1 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. LIMIT_BAL = col_double(),
## .. SEX = col_character(),
## .. MARRIAGE = col_character(),
## .. AGE = col_double(),
## .. PAY_0 = col_double(),
## .. BILL_AMT1 = col_double(),
## .. PAY_AMT1 = col_double(),
## .. DEFAULT = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
index3 <- sample(nrow(defaultpayments), nrow(defaultpayments)*0.80)
default_train <- defaultpayments[index3,]
default_test <- defaultpayments[-index3,]
default_gam <- gam(DEFAULT ~ s(LIMIT_BAL) + (SEX) + (MARRIAGE) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1), data = default_train)
summary(default_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + (SEX) + (MARRIAGE) + s(AGE) + s(PAY_0) +
## s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.201883 0.006352 31.781 < 2e-16 ***
## SEXM 0.021669 0.007805 2.776 0.00551 **
## MARRIAGESingle 0.015209 0.008518 1.786 0.07419 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 6.109 7.010 17.581 < 2e-16 ***
## s(AGE) 1.000 1.000 0.521 0.470335
## s(PAY_0) 6.690 7.395 260.288 < 2e-16 ***
## s(BILL_AMT1) 1.000 1.000 26.859 6.97e-07 ***
## s(PAY_AMT1) 5.465 6.613 3.645 0.000936 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.204 Deviance explained = 20.6%
## GCV = 0.13575 Scale est. = 0.13542 n = 9600
# As the p-value for Marriage and Age is more than 0.05, these variables are not statistically significant and will be removed.
default_gam2 <- gam(DEFAULT ~ s(LIMIT_BAL) + (SEX) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1), data = default_train)
summary(default_gam2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## DEFAULT ~ s(LIMIT_BAL) + (SEX) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.208760 0.004854 43.004 < 2e-16 ***
## SEXM 0.021805 0.007763 2.809 0.00498 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(LIMIT_BAL) 6.069 6.975 17.146 < 2e-16 ***
## s(PAY_0) 6.719 7.420 260.730 < 2e-16 ***
## s(BILL_AMT1) 1.000 1.000 28.537 2.03e-07 ***
## s(PAY_AMT1) 5.555 6.710 3.812 0.000556 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.204 Deviance explained = 20.5%
## GCV = 0.13578 Scale est. = 0.13548 n = 9600
BILL_AMT1 is related linearly to the target as the edf is = 1.
plot(default_gam2, pages = 1)
pcut_gam <- 1/2
prob_gam_out <- predict(default_gam2, default_test, type = "response")
pred_gam_out <- (prob_gam_out >= pcut_gam)*1
table(default_test$DEFAULT, pred_gam_out, dnn = c("Observed", "Predicted"))
## Predicted
## Observed 0 1
## 0 1762 93
## 1 374 171