link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/prob1.csv"
df <- read.csv(link, header=TRUE, sep=",")
XY <- as.data.frame(c(df["X3"], df["Y2"]))
names(XY) <- c("X", "Y")
kable(XY) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| X | Y |
|---|---|
| 9.5 | 20.8 |
| 3.7 | 14.6 |
| 11.7 | 18.0 |
| 7.4 | 7.3 |
| 5.3 | 19.4 |
| 7.4 | 13.5 |
| 7.4 | 14.7 |
| 8.6 | 15.3 |
| 9.1 | 12.6 |
| 11.4 | 13.0 |
| 8.4 | 13.1 |
| 7.3 | 10.3 |
| 11.3 | 14.9 |
| 4.4 | 14.8 |
| 9.3 | 16.2 |
| 10.9 | 15.7 |
| 10.9 | 16.3 |
| 7.7 | 11.5 |
| 7.7 | 12.2 |
| 11.5 | 11.8 |
y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)
cond_a1 <- filter(XY, XY$Y>y)## Warning: package 'bindrcpp' was built under R version 3.4.4
cond_a2 <- filter(cond_a1, cond_a1$X>x)
p_1a <- nrow(cond_a2)/nrow(cond_a1)
print(sprintf("Y>y results in %g rows. Given that condition, X>x results in %g rows. The P(X>x | Y>y) is %g.", nrow(cond_a1), nrow(cond_a2), round(p_1a, 2)))## [1] "Y>y results in 15 rows. Given that condition, X>x results in 3 rows. The P(X>x | Y>y) is 0.2."
y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)
cond_b1 <- subset(XY, XY$Y>y & XY$X>x)
p_1b <- nrow(cond_b1)/nrow(XY)
print(sprintf("Filtering the dataframe for Y>y and X>x results in %g rows. Given that condition, the P(X>x, Y>y) is %g.", nrow(cond_b1), round(p_1b, 2)))## [1] "Filtering the dataframe for Y>y and X>x results in 3 rows. Given that condition, the P(X>x, Y>y) is 0.15."
y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)
cond_c1 <- filter(XY, XY$Y>y)
cond_c2 <- filter(cond_c1, cond_c1$X<x)
p_1c <- nrow(cond_c2)/nrow(cond_c1)
print(sprintf("Y>y results in %g rows. Given that condition, X<x results in %g rows. The P(X<x | Y>y) is %g.", nrow(cond_c1), nrow(cond_c2), round(p_1c, 2)))## [1] "Y>y results in 15 rows. Given that condition, X<x results in 10 rows. The P(X<x | Y>y) is 0.67."
y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)
cond_y1 <- filter(XY, XY$Y<=y)
cond_y2 <- filter(XY, XY$Y>y)
count1_1 <- nrow(filter(cond_y1, cond_y1$X<=x))
count1_2 <- nrow(filter(cond_y2, cond_y2$X<=x))
count2_1 <- nrow(filter(cond_y1, cond_y1$X>x))
count2_2 <- nrow(filter(cond_y2, cond_y2$X>x))
count_table <- matrix(data = c(count1_1, count1_2, NA, count2_1, count2_2, NA, NA, NA, NA), ncol=3, nrow=3)
count_table[1,3] <- sum(count_table[1,1], count_table[1,2])
count_table[2,3] <- sum(count_table[2,1], count_table[2,2])
count_table[3,1] <- sum(count_table[1,1], count_table[2,1])
count_table[3,2] <- sum(count_table[1,2], count_table[2,2])
count_table[3,3] <- sum(count_table[1,1], count_table[1,2]) + sum(count_table[2,1], count_table[2,2])
count_table <- as.data.frame(count_table)
names(count_table) <- c("x <= 3rd quartile", "x > 3rd quartile", "TOTAL")
rownames(count_table) <- c("y <= 1st quartile", "y > 1st quartile", "TOTAL")
kable(count_table) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| x <= 3rd quartile | x > 3rd quartile | TOTAL | |
|---|---|---|---|
| y <= 1st quartile | 4 | 1 | 5 |
| y > 1st quartile | 12 | 3 | 15 |
| TOTAL | 16 | 4 | 20 |
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations below the 3rd quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does \(P(AB)=P(A)P(B)\)? Check mathematically, and then evaluate by running a Chi Square Test for association.
Splitting the training data according to quartiles is not a good test of independence, since the split is not random – there is likely to be a pattern to the values that fall above or below a certain quartile value.
We can confirm that \(P(A) \times P(B) \neq P(AB)\), which means the two variables are not independent.
\(P(A) = 16/20 = 0.67\)
\(P(B) = 15/20 = 0.75\)
\(P(A) \times P(B) = 0.5025\)
\(P(AB) = P(A|B) \ P(B) = 12/20 \times 15/20 = 0.45\)
Finally, we can use the \(\chi^2\) test for statistical independence.
Null hypothesis: \(A\) and \(B\) are independent.
Alternate hypothesis: \(A\) and \(B\) are not independent.
We get a p-value of \(0.8926\) when running our \(\chi^2\) test, which is much greater than \(0.05\). This means that there is a 90% chance that these variables could be independent due to random chance, so we reject the null hypothesis as true. \(A\) and \(B\) are not independent variables.
chisq.test(XY)##
## Pearson's Chi-squared test
##
## data: XY
## X-squared = 11.833, df = 19, p-value = 0.8926
# Read in the training data
link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/train.csv"
train <- read.csv(link, header=TRUE, sep=",")a | Univariate descriptive statistics and plots
Provide univariate descriptive statistics and appropriate plots for the training data set.
The train_df dataframe contains 1,460 observations, 37 predictor (independent) variables, and 1 response (dependent) variable – sale price.
After examining each variable using describe, we can see that LotFrontage may have many missing values – its \(n\)-count is about 200 observations smaller than the other variables.
d <- describe(train)
kable(d) %>%
kable_styling() %>%
scroll_box(width = "100%", height = "300px")| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | 1 | 1460 | 7.305000e+02 | 4.216100e+02 | 730.5 | 7.305000e+02 | 541.1490 | 1 | 1460 | 1459 | 0.0000000 | -1.2024660 | 11.0340382 |
| MSSubClass | 2 | 1460 | 5.689726e+01 | 4.230057e+01 | 50.0 | 4.915240e+01 | 44.4780 | 20 | 190 | 170 | 1.4047656 | 1.5644157 | 1.1070565 |
| MSZoning* | 3 | 1460 | 4.028767e+00 | 6.320174e-01 | 4.0 | 4.061644e+00 | 0.0000 | 1 | 5 | 4 | -1.7318311 | 6.2488744 | 0.0165407 |
| LotFrontage | 4 | 1201 | 7.004996e+01 | 2.428475e+01 | 69.0 | 6.894381e+01 | 16.3086 | 21 | 313 | 292 | 2.1581677 | 17.3413841 | 0.7007485 |
| LotArea | 5 | 1460 | 1.051683e+04 | 9.981265e+03 | 9478.5 | 9.563284e+03 | 2962.2348 | 1300 | 215245 | 213945 | 12.1826150 | 202.2623223 | 261.2216422 |
| Street* | 6 | 1460 | 1.995890e+00 | 6.399610e-02 | 2.0 | 2.000000e+00 | 0.0000 | 1 | 2 | 1 | -15.4868962 | 238.0069738 | 0.0016749 |
| Alley* | 7 | 91 | 1.450549e+00 | 5.003052e-01 | 1.0 | 1.438356e+00 | 0.0000 | 1 | 2 | 1 | 0.1955092 | -1.9832087 | 0.0524462 |
| LotShape* | 8 | 1460 | 2.942466e+00 | 1.409156e+00 | 4.0 | 3.053082e+00 | 0.0000 | 1 | 4 | 3 | -0.6089215 | -1.5964795 | 0.0368793 |
| LandContour* | 9 | 1460 | 3.777397e+00 | 7.076659e-01 | 4.0 | 3.997431e+00 | 0.0000 | 1 | 4 | 3 | -3.1560041 | 8.6458033 | 0.0185205 |
| Utilities* | 10 | 1460 | 1.000685e+00 | 2.617120e-02 | 1.0 | 1.000000e+00 | 0.0000 | 1 | 2 | 1 | 38.1314686 | 1453.0041082 | 0.0006849 |
| LotConfig* | 11 | 1460 | 4.019178e+00 | 1.622634e+00 | 5.0 | 4.273973e+00 | 0.0000 | 1 | 5 | 4 | -1.1332994 | -0.5853058 | 0.0424663 |
| LandSlope* | 12 | 1460 | 1.062329e+00 | 2.762325e-01 | 1.0 | 1.000000e+00 | 0.0000 | 1 | 3 | 2 | 4.8037958 | 24.4716699 | 0.0072293 |
| Neighborhood* | 13 | 1460 | 1.315000e+01 | 5.893512e+00 | 13.0 | 1.310702e+01 | 7.4130 | 1 | 25 | 24 | 0.0180075 | -1.0564682 | 0.1542403 |
| Condition1* | 14 | 1460 | 3.031507e+00 | 8.685151e-01 | 3.0 | 3.000000e+00 | 0.0000 | 1 | 9 | 8 | 3.0129949 | 16.3422050 | 0.0227301 |
| Condition2* | 15 | 1460 | 3.008219e+00 | 2.590399e-01 | 3.0 | 3.000000e+00 | 0.0000 | 1 | 8 | 7 | 13.1447909 | 247.5370605 | 0.0067794 |
| BldgType* | 16 | 1460 | 1.493151e+00 | 1.198277e+00 | 1.0 | 1.143836e+00 | 0.0000 | 1 | 5 | 4 | 2.2410358 | 3.4057174 | 0.0313603 |
| HouseStyle* | 17 | 1460 | 4.038356e+00 | 1.911305e+00 | 3.0 | 4.029966e+00 | 1.4826 | 1 | 8 | 7 | 0.3061246 | -0.9641411 | 0.0500211 |
| OverallQual | 18 | 1460 | 6.099315e+00 | 1.382996e+00 | 6.0 | 6.079623e+00 | 1.4826 | 1 | 10 | 9 | 0.2164984 | 0.0876226 | 0.0361947 |
| OverallCond | 19 | 1460 | 5.575342e+00 | 1.112799e+00 | 5.0 | 5.477740e+00 | 0.0000 | 1 | 9 | 8 | 0.6916440 | 1.0929087 | 0.0291233 |
| YearBuilt | 20 | 1460 | 1.971268e+03 | 3.020290e+01 | 1973.0 | 1.974127e+03 | 37.0650 | 1872 | 2010 | 138 | -0.6122012 | -0.4456575 | 0.7904461 |
| YearRemodAdd | 21 | 1460 | 1.984866e+03 | 2.064541e+01 | 1994.0 | 1.986369e+03 | 19.2738 | 1950 | 2010 | 60 | -0.5025278 | -1.2743655 | 0.5403150 |
| RoofStyle* | 22 | 1460 | 2.410274e+00 | 8.349978e-01 | 2.0 | 2.264555e+00 | 0.0000 | 1 | 6 | 5 | 1.4707694 | 0.6051553 | 0.0218529 |
| RoofMatl* | 23 | 1460 | 2.075343e+00 | 5.991268e-01 | 2.0 | 2.000000e+00 | 0.0000 | 1 | 8 | 7 | 8.0927465 | 66.2778413 | 0.0156799 |
| Exterior1st* | 24 | 1460 | 1.062466e+01 | 3.197659e+00 | 13.0 | 1.092979e+01 | 1.4826 | 1 | 15 | 14 | -0.7248218 | -0.3678431 | 0.0836866 |
| Exterior2nd* | 25 | 1460 | 1.133973e+01 | 3.540570e+00 | 14.0 | 1.164897e+01 | 2.9652 | 1 | 16 | 15 | -0.6915394 | -0.5240999 | 0.0926610 |
| MasVnrType* | 26 | 1452 | 2.761019e+00 | 6.157107e-01 | 3.0 | 2.728916e+00 | 0.0000 | 1 | 4 | 3 | -0.0673056 | -0.1344926 | 0.0161582 |
| MasVnrArea | 27 | 1452 | 1.036853e+02 | 1.810662e+02 | 0.0 | 6.315232e+01 | 0.0000 | 0 | 1600 | 1600 | 2.6635721 | 10.0256421 | 4.7517556 |
| ExterQual* | 28 | 1460 | 3.539726e+00 | 6.939946e-01 | 4.0 | 3.650685e+00 | 0.0000 | 1 | 4 | 3 | -1.8265061 | 3.8628097 | 0.0181627 |
| ExterCond* | 29 | 1460 | 4.733562e+00 | 7.318072e-01 | 5.0 | 4.946062e+00 | 0.0000 | 1 | 5 | 4 | -2.5600360 | 5.2908714 | 0.0191523 |
| Foundation* | 30 | 1460 | 2.396575e+00 | 7.223936e-01 | 2.0 | 2.457192e+00 | 1.4826 | 1 | 6 | 5 | 0.0910300 | 1.0164412 | 0.0189059 |
| BsmtQual* | 31 | 1423 | 3.261420e+00 | 8.677500e-01 | 3.0 | 3.432836e+00 | 1.4826 | 1 | 4 | 3 | -1.3112186 | 1.2749390 | 0.0230034 |
| BsmtCond* | 32 | 1423 | 3.812368e+00 | 6.586556e-01 | 4.0 | 4.000000e+00 | 0.0000 | 1 | 4 | 3 | -3.3947570 | 10.1393207 | 0.0174605 |
| BsmtExposure* | 33 | 1422 | 3.265119e+00 | 1.147481e+00 | 4.0 | 3.456063e+00 | 0.0000 | 1 | 4 | 3 | -1.1466827 | -0.3879998 | 0.0304296 |
| BsmtFinType1* | 34 | 1423 | 3.732256e+00 | 1.825932e+00 | 3.0 | 3.790167e+00 | 2.9652 | 1 | 6 | 5 | -0.0154188 | -1.3923444 | 0.0484041 |
| BsmtFinSF1 | 35 | 1460 | 4.436397e+02 | 4.560981e+02 | 383.5 | 3.860762e+02 | 568.5771 | 0 | 5644 | 5644 | 1.6820413 | 11.0568141 | 11.9366326 |
| BsmtFinType2* | 36 | 1422 | 5.708158e+00 | 9.363582e-01 | 6.0 | 5.978910e+00 | 0.0000 | 1 | 6 | 5 | -3.5641512 | 12.3164870 | 0.0248309 |
| BsmtFinSF2 | 37 | 1460 | 4.654932e+01 | 1.613193e+02 | 0.0 | 1.382705e+00 | 0.0000 | 0 | 1474 | 1474 | 4.2465214 | 20.0088641 | 4.2219183 |
| BsmtUnfSF | 38 | 1460 | 5.672404e+02 | 4.418670e+02 | 477.5 | 5.192885e+02 | 426.9888 | 0 | 2336 | 2336 | 0.9183784 | 0.4645113 | 11.5641868 |
| TotalBsmtSF | 39 | 1460 | 1.057429e+03 | 4.387053e+02 | 991.5 | 1.036695e+03 | 347.6697 | 0 | 6110 | 6110 | 1.5211239 | 13.1788560 | 11.4814431 |
| Heating* | 40 | 1460 | 2.036301e+00 | 2.951238e-01 | 2.0 | 2.000000e+00 | 0.0000 | 1 | 6 | 5 | 9.8348595 | 110.9795650 | 0.0077237 |
| HeatingQC* | 41 | 1460 | 2.538356e+00 | 1.739524e+00 | 1.0 | 2.422945e+00 | 0.0000 | 1 | 5 | 4 | 0.4822257 | -1.5124682 | 0.0455254 |
| CentralAir* | 42 | 1460 | 1.934932e+00 | 2.467312e-01 | 2.0 | 2.000000e+00 | 0.0000 | 1 | 2 | 1 | -3.5231347 | 10.4196162 | 0.0064573 |
| Electrical* | 43 | 1459 | 4.681974e+00 | 1.051629e+00 | 5.0 | 5.000000e+00 | 0.0000 | 1 | 5 | 4 | -3.0554792 | 7.4856348 | 0.0275318 |
| X1stFlrSF | 44 | 1460 | 1.162627e+03 | 3.865877e+02 | 1087.0 | 1.129991e+03 | 347.6697 | 334 | 4692 | 4358 | 1.3739290 | 5.7101321 | 10.1174635 |
| X2ndFlrSF | 45 | 1460 | 3.469925e+02 | 4.365284e+02 | 0.0 | 2.853639e+02 | 0.0000 | 0 | 2065 | 2065 | 0.8113600 | -0.5590240 | 11.4244713 |
| LowQualFinSF | 46 | 1460 | 5.844520e+00 | 4.862308e+01 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 572 | 572 | 8.9928333 | 82.8282385 | 1.2725242 |
| GrLivArea | 47 | 1460 | 1.515464e+03 | 5.254804e+02 | 1464.0 | 1.467670e+03 | 483.3276 | 334 | 5642 | 5308 | 1.3637536 | 4.8634828 | 13.7524502 |
| BsmtFullBath | 48 | 1460 | 4.253425e-01 | 5.189106e-01 | 0.0 | 3.921233e-01 | 0.0000 | 0 | 3 | 3 | 0.5948424 | -0.8432916 | 0.0135805 |
| BsmtHalfBath | 49 | 1460 | 5.753420e-02 | 2.387526e-01 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 2 | 2 | 4.0949749 | 16.3099569 | 0.0062484 |
| FullBath | 50 | 1460 | 1.565068e+00 | 5.509158e-01 | 2.0 | 1.560788e+00 | 0.0000 | 0 | 3 | 3 | 0.0364865 | -0.8611503 | 0.0144181 |
| HalfBath | 51 | 1460 | 3.828767e-01 | 5.028854e-01 | 0.0 | 3.433219e-01 | 0.0000 | 0 | 2 | 2 | 0.6745093 | -1.0799824 | 0.0131611 |
| BedroomAbvGr | 52 | 1460 | 2.866438e+00 | 8.157780e-01 | 3.0 | 2.852740e+00 | 0.0000 | 0 | 8 | 8 | 0.2113551 | 2.2119881 | 0.0213499 |
| KitchenAbvGr | 53 | 1460 | 1.046575e+00 | 2.203382e-01 | 1.0 | 1.000000e+00 | 0.0000 | 0 | 3 | 3 | 4.4791783 | 21.4211386 | 0.0057665 |
| KitchenQual* | 54 | 1460 | 3.339726e+00 | 8.301613e-01 | 4.0 | 3.504281e+00 | 0.0000 | 1 | 4 | 3 | -1.4198855 | 1.7156248 | 0.0217263 |
| TotRmsAbvGrd | 55 | 1460 | 6.517808e+00 | 1.625393e+00 | 6.0 | 6.408390e+00 | 1.4826 | 2 | 14 | 12 | 0.6749517 | 0.8683368 | 0.0425385 |
| Functional* | 56 | 1460 | 6.749315e+00 | 9.796592e-01 | 7.0 | 7.000000e+00 | 0.0000 | 1 | 7 | 6 | -4.0765682 | 16.3748321 | 0.0256389 |
| Fireplaces | 57 | 1460 | 6.130137e-01 | 6.446664e-01 | 1.0 | 5.342466e-01 | 1.4826 | 0 | 3 | 3 | 0.6482311 | -0.2244068 | 0.0168717 |
| FireplaceQu* | 58 | 770 | 3.733766e+00 | 1.132578e+00 | 3.0 | 3.798701e+00 | 1.4826 | 1 | 5 | 4 | -0.1578800 | -0.9845157 | 0.0408153 |
| GarageType* | 59 | 1379 | 3.279188e+00 | 1.785588e+00 | 2.0 | 3.105882e+00 | 0.0000 | 1 | 6 | 5 | 0.7618616 | -1.3020231 | 0.0480838 |
| GarageYrBlt | 60 | 1379 | 1.978506e+03 | 2.468972e+01 | 1980.0 | 1.981065e+03 | 31.1346 | 1900 | 2010 | 110 | -0.6480025 | -0.4249123 | 0.6648660 |
| GarageFinish* | 61 | 1379 | 2.183466e+00 | 8.128963e-01 | 2.0 | 2.228959e+00 | 1.4826 | 1 | 3 | 2 | -0.3465419 | -1.4058467 | 0.0218904 |
| GarageCars | 62 | 1460 | 1.767123e+00 | 7.473150e-01 | 2.0 | 1.773973e+00 | 0.0000 | 0 | 4 | 4 | -0.3418454 | 0.2117307 | 0.0195581 |
| GarageArea | 63 | 1460 | 4.729801e+02 | 2.138048e+02 | 480.0 | 4.698082e+02 | 177.9120 | 0 | 1418 | 1418 | 0.1796113 | 0.9044687 | 5.5955284 |
| GarageQual* | 64 | 1379 | 4.864395e+00 | 6.105280e-01 | 5.0 | 5.000000e+00 | 0.0000 | 1 | 5 | 4 | -4.4312099 | 18.2507961 | 0.0164408 |
| GarageCond* | 65 | 1379 | 4.899928e+00 | 5.224912e-01 | 5.0 | 5.000000e+00 | 0.0000 | 1 | 5 | 4 | -5.2754778 | 26.7731497 | 0.0140701 |
| PavedDrive* | 66 | 1460 | 2.856164e+00 | 4.965919e-01 | 3.0 | 3.000000e+00 | 0.0000 | 1 | 3 | 2 | -3.3021422 | 9.2205389 | 0.0129964 |
| WoodDeckSF | 67 | 1460 | 9.424452e+01 | 1.253388e+02 | 0.0 | 7.175771e+01 | 0.0000 | 0 | 857 | 857 | 1.5382100 | 2.9704171 | 3.2802662 |
| OpenPorchSF | 68 | 1460 | 4.666027e+01 | 6.625603e+01 | 25.0 | 3.323288e+01 | 37.0650 | 0 | 547 | 547 | 2.3594857 | 8.4414910 | 1.7339995 |
| EnclosedPorch | 69 | 1460 | 2.195411e+01 | 6.111915e+01 | 0.0 | 3.866438e+00 | 0.0000 | 0 | 552 | 552 | 3.0835258 | 10.3726341 | 1.5995612 |
| X3SsnPorch | 70 | 1460 | 3.409589e+00 | 2.931733e+01 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 508 | 508 | 10.2831784 | 123.0623116 | 0.7672696 |
| ScreenPorch | 71 | 1460 | 1.506096e+01 | 5.575742e+01 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 480 | 480 | 4.1137473 | 18.3426076 | 1.4592383 |
| PoolArea | 72 | 1460 | 2.758904e+00 | 4.017731e+01 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 738 | 738 | 14.7979183 | 222.1917078 | 1.0514882 |
| PoolQC* | 73 | 7 | 2.142857e+00 | 8.997354e-01 | 2.0 | 2.142857e+00 | 1.4826 | 1 | 3 | 2 | -0.2161500 | -1.9030436 | 0.3400680 |
| Fence* | 74 | 281 | 2.427046e+00 | 8.634533e-01 | 3.0 | 2.484444e+00 | 0.0000 | 1 | 4 | 3 | -0.5712072 | -0.8825197 | 0.0515093 |
| MiscFeature* | 75 | 54 | 2.907407e+00 | 4.458834e-01 | 3.0 | 3.000000e+00 | 0.0000 | 1 | 4 | 3 | -2.9309302 | 10.7075632 | 0.0606770 |
| MiscVal | 76 | 1460 | 4.348904e+01 | 4.961230e+02 | 0.0 | 0.000000e+00 | 0.0000 | 0 | 15500 | 15500 | 24.4265224 | 697.6400721 | 12.9841330 |
| MoSold | 77 | 1460 | 6.321918e+00 | 2.703626e+00 | 6.0 | 6.252568e+00 | 2.9652 | 1 | 12 | 11 | 0.2116175 | -0.4103846 | 0.0707571 |
| YrSold | 78 | 1460 | 2.007816e+03 | 1.328095e+00 | 2008.0 | 2.007770e+03 | 1.4826 | 2006 | 2010 | 4 | 0.0960708 | -1.1931116 | 0.0347578 |
| SaleType* | 79 | 1460 | 8.509589e+00 | 1.560932e+00 | 9.0 | 8.922089e+00 | 0.0000 | 1 | 9 | 8 | -3.8316025 | 14.5748669 | 0.0408515 |
| SaleCondition* | 80 | 1460 | 4.770548e+00 | 1.100854e+00 | 5.0 | 5.000000e+00 | 0.0000 | 1 | 6 | 5 | -2.7355374 | 6.8235962 | 0.0288107 |
| SalePrice | 81 | 1460 | 1.809212e+05 | 7.944250e+04 | 163000.0 | 1.707833e+05 | 56338.8000 | 34900 | 755000 | 720100 | 1.8790086 | 6.4967893 | 2079.1053240 |
I limited the variables to those that were numeric. When plotting their distributions, I noticed many are right-skewed, suggesting that they may need a transform to approach normal.
# Select numeric variables and gather
gathered <- select_if(train, is.numeric) %>% gather()
# Plot distributions of variables
distribution_plot <- gathered %>%
ggplot(aes(value)) +
facet_wrap( ~ key, scales = "free") +
ggtitle("Housing Dataset Density Plots (38 Numeric Variables)") +
geom_density() +
theme(
strip.text = element_text(size = 8),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.text = element_blank()
)
distribution_plotb | Scatterplot matrix
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
First Floor Square Feet and Overall Quality Rating appear to have a positive relationship to Sale Price, which makes intuitive sense. A home with more square feet and a higher quality rating can demand a higher price.
to_matrix <- data.matrix(dplyr::select(train, SalePrice, X1stFlrSF, OverallQual))
pairs(~SalePrice + X1stFlrSF + OverallQual, data = to_matrix)c | Correlation matrix
Derive a correlation matrix for any THREE quantitative variables in the dataset.
This correlation matrix evaluates the strength of the relationship between Sales Price and three variables: 1st Floor Square Feet, Basement Square Feet, and Overall Quality Rating. Of the three variables, Overall Quality is most strongly correlated with Sales Price.
to_corr <- dplyr::select(train, X1stFlrSF, TotalBsmtSF, OverallQual, SalePrice)
corr <- cor(to_corr, method = "pearson", use = "complete.obs")
corr## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF 1.0000000 0.8195300 0.4762238 0.6058522
## TotalBsmtSF 0.8195300 1.0000000 0.5378085 0.6135806
## OverallQual 0.4762238 0.5378085 1.0000000 0.7909816
## SalePrice 0.6058522 0.6135806 0.7909816 1.0000000
corrplot(corr, type = "upper", addCoef.col = "white", tl.srt=45, tl.col = "black")d | Hypothesis testing
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval.
Null: the true correlation between each variable and Sale Price is equal to zero.
Alternate: the true correlation is not equal to zero.
We are 80% confident that this variable’s true correlation to Sale Price is between 0.58 and 0.63. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.
FirstFloorSqFt <- as.matrix(dplyr::select(train, X1stFlrSF))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))
cor.test(FirstFloorSqFt, SalePrice, conf.level=0.8)##
## Pearson's product-moment correlation
##
## data: FirstFloorSqFt and SalePrice
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5841687 0.6266715
## sample estimates:
## cor
## 0.6058522
We are 80% confident that this variable’s true correlation to Sale Price is between 0.59 and 0.63. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.
BasementSqFt <- as.matrix(dplyr::select(train, TotalBsmtSF))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))
cor.test(BasementSqFt, SalePrice, conf.level=0.8)##
## Pearson's product-moment correlation
##
## data: BasementSqFt and SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
We are 80% confident that this variable’s true correlation to Sale Price is between 0.78 and 0.80. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.
OverallQuality <- as.matrix(dplyr::select(train, OverallQual))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))
cor.test(OverallQuality, SalePrice, conf.level=0.8)##
## Pearson's product-moment correlation
##
## data: OverallQuality and SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780752 0.8032204
## sample estimates:
## cor
## 0.7909816
e | Analysis
Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Familywise error refers to the likelihood of encountering a Type I error – a false positive – in a series of hypothesis tests. The probability of this seems low, since the p-value of each of our correlations above are so small.
a | Invert the correlation matrix
# Original correlation matrix
corr## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF 1.0000000 0.8195300 0.4762238 0.6058522
## TotalBsmtSF 0.8195300 1.0000000 0.5378085 0.6135806
## OverallQual 0.4762238 0.5378085 1.0000000 0.7909816
## SalePrice 0.6058522 0.6135806 0.7909816 1.0000000
# Inverse of correlation matrix
corr_inverse <- solve(corr)
corr_inverse## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF 3.2585965 -2.3703774 0.3583515 -0.8032598
## TotalBsmtSF -2.3703774 3.3472154 -0.4881843 -0.2315431
## OverallQual 0.3583515 -0.4881843 2.7426122 -2.0869234
## SalePrice -0.8032598 -0.2315431 -2.0869234 3.2794450
b | Check precision and correlation matrices
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
First, corr times corr_inverse:
a <- corr %*% corr_inverse
round(a)## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF 1 0 0 0
## TotalBsmtSF 0 1 0 0
## OverallQual 0 0 1 0
## SalePrice 0 0 0 1
Then, corr_inverse times corr:
b <- corr_inverse %*% corr
round(b)## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF 1 0 0 0
## TotalBsmtSF 0 1 0 0
## OverallQual 0 0 1 0
## SalePrice 0 0 0 1
Both calculations result in 1’s along the diagonal and values very close to zero in every other position – the identity matrix.
c | LU decomposition
lu_corr <- lu.decomposition(corr)
lu_corr## $L
## [,1] [,2] [,3] [,4]
## [1,] 1.0000000 0.0000000 0.0000000 0
## [2,] 0.8195300 1.0000000 0.0000000 0
## [3,] 0.4762238 0.4492753 1.0000000 0
## [4,] 0.6058522 0.3565073 0.6363648 1
##
## $U
## [,1] [,2] [,3] [,4]
## [1,] 1 0.8195300 0.4762238 0.6058522
## [2,] 0 0.3283706 0.1475288 0.1170665
## [3,] 0 0.0000000 0.7069298 0.4498653
## [4,] 0 0.0000000 0.0000000 0.3049296
# Check that the decomposition results in the original correlation matrix
lu_check <- lu_corr$L %*% lu_corr$U
corr == lu_check## X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF TRUE TRUE TRUE TRUE
## TotalBsmtSF TRUE TRUE TRUE TRUE
## OverallQual TRUE TRUE TRUE TRUE
## SalePrice TRUE TRUE TRUE TRUE
a | Select and shift variable
Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
I chose the GrLivArea variable – the square feet of above grade (ground) living area. Before transformation, the variable was skewed to the right. After centering the variable at zero and taking the absolute value, the variable more closely resembles an exponential distribution.
# Original variable
hist(train$GrLivArea, breaks=50, freq = FALSE, main = "Histogram of LivingArea")# Tranform variable
LivingArea <- abs(scale(train$GrLivArea))
hist(LivingArea, breaks=50, freq = FALSE)b | Fit Exponential PDF
Then load the
MASSpackage and runfitdistrto fit an exponential probability density function. Take 1000 samples from this exponential distribution using this value. Plot a histogram and compare it with a histogram of your original variable.
When fitted to an exponential distribution, the estimated rate of change \(\lambda\) is about \(1.32\).
f <- fitdistr(LivingArea, densfun = "exponential")
rate_est <- f$estimate
rate_est## rate
## 1.322546
We can take 1,000 samples of an exponential function using the rate of change.
distribution <- rexp(1000, rate_est)
hist(distribution, freq = FALSE, breaks=50, main = "Exponential PDF Model")
curve(dexp(x, rate = rate_est), col = "red", add = TRUE)We see the exponential PDF model fits the transformed Living Area variable pretty well.
hist(LivingArea, breaks=50, freq = FALSE, main = "Exponential PDF Model Fitted to Living Area")
curve(dexp(x, rate = rate_est), col = "red", add = TRUE)c | Find percentiles and CI
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).
qexp(0.05, rate = rate_est)## [1] 0.03878376
qexp(0.95, rate = rate_est)## [1] 2.265126
d | Empirical percentiles
Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
LivingArea_norm <- log(train$GrLivArea)
hist(LivingArea_norm, breaks=50, freq = FALSE)qnorm(0.05)## [1] -1.644854
qnorm(0.95)## [1] 1.644854
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
First, I restricted the data to numeric variables. Then, as a baseline, a created a model with all of the variables, and examined the residuals.
The complete-variable model results in an Adjusted R-Squared of 0.8036, which is pretty good. However, since we’re using a lot of variables, there’s a chance the data is being overfit.
train_model <- select_if(train, is.numeric)
model <- lm(data=train_model, SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold)
summary(model)##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea +
## BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF +
## X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath +
## FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF +
## OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch +
## PoolArea + MiscVal + MoSold + YrSold, data = train_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442865 -16873 -2581 14998 318042
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.232e+05 1.701e+06 -0.190 0.849317
## MSSubClass -2.005e+02 3.449e+01 -5.814 8.03e-09 ***
## LotFrontage -1.161e+02 6.124e+01 -1.896 0.058203 .
## LotArea 5.454e-01 1.573e-01 3.466 0.000548 ***
## OverallQual 1.870e+04 1.478e+03 12.646 < 2e-16 ***
## OverallCond 5.227e+03 1.367e+03 3.824 0.000139 ***
## YearBuilt 3.170e+02 8.762e+01 3.617 0.000311 ***
## YearRemodAdd 1.206e+02 8.661e+01 1.392 0.164174
## MasVnrArea 3.160e+01 7.006e+00 4.511 7.15e-06 ***
## BsmtFinSF1 1.739e+01 5.835e+00 2.980 0.002947 **
## BsmtFinSF2 8.362e+00 8.763e+00 0.954 0.340205
## BsmtUnfSF 5.006e+00 5.275e+00 0.949 0.342890
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.591e+01 7.356e+00 6.241 6.21e-10 ***
## X2ndFlrSF 4.668e+01 6.099e+00 7.654 4.28e-14 ***
## LowQualFinSF 3.415e+01 2.788e+01 1.225 0.220788
## GrLivArea NA NA NA NA
## BsmtFullBath 8.980e+03 3.194e+03 2.812 0.005018 **
## BsmtHalfBath 2.490e+03 5.071e+03 0.491 0.623487
## FullBath 5.390e+03 3.529e+03 1.527 0.126941
## HalfBath -1.119e+03 3.320e+03 -0.337 0.736244
## BedroomAbvGr -1.023e+04 2.154e+03 -4.750 2.30e-06 ***
## KitchenAbvGr -2.193e+04 6.704e+03 -3.271 0.001105 **
## TotRmsAbvGrd 5.440e+03 1.486e+03 3.661 0.000263 ***
## Fireplaces 4.375e+03 2.188e+03 2.000 0.045793 *
## GarageYrBlt -4.914e+01 9.093e+01 -0.540 0.589011
## GarageCars 1.679e+04 3.487e+03 4.815 1.68e-06 ***
## GarageArea 6.488e+00 1.211e+01 0.536 0.592338
## WoodDeckSF 2.155e+01 1.002e+01 2.151 0.031713 *
## OpenPorchSF -2.315e+00 1.948e+01 -0.119 0.905404
## EnclosedPorch 7.233e+00 2.061e+01 0.351 0.725733
## X3SsnPorch 3.458e+01 3.749e+01 0.922 0.356593
## ScreenPorch 5.797e+01 2.040e+01 2.842 0.004572 **
## PoolArea -6.126e+01 2.984e+01 -2.053 0.040326 *
## MiscVal -3.850e+00 6.955e+00 -0.554 0.579980
## MoSold -2.240e+02 4.227e+02 -0.530 0.596213
## YrSold -2.536e+02 8.454e+02 -0.300 0.764216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36790 on 1086 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8095, Adjusted R-squared: 0.8036
## F-statistic: 135.7 on 34 and 1086 DF, p-value: < 2.2e-16
When we look at plots of the complete-variable model, we see that there are outliers that are influencing the results, but on the whole the residuals are close to normal. However, I’ll if I can improve it by performing backwards-substitution.
par(mfrow=c(2,2))
plot(model)After a few rounds of removing variables with low p-values, I’m left with 12 variables and an Adjusted R-Squared of 0.7717. That means that these 12 variables explain about 77% of the variability in Sale Price.
model_1 <- lm(data=train_model, SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageCars + ScreenPorch)
summary(model_1)##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual +
## OverallCond + YearBuilt + MasVnrArea + BsmtFullBath + BedroomAbvGr +
## KitchenAbvGr + TotRmsAbvGrd + GarageCars + ScreenPorch, data = train_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -321953 -20460 -3078 15431 390991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.464e+05 9.482e+04 -7.872 6.82e-15 ***
## MSSubClass -1.536e+02 2.512e+01 -6.115 1.24e-09 ***
## LotArea 8.078e-01 1.048e-01 7.711 2.32e-14 ***
## OverallQual 2.489e+04 1.132e+03 21.977 < 2e-16 ***
## OverallCond 4.570e+03 9.929e+02 4.603 4.52e-06 ***
## YearBuilt 3.317e+02 4.797e+01 6.916 6.98e-12 ***
## MasVnrArea 4.979e+01 6.190e+00 8.043 1.81e-15 ***
## BsmtFullBath 1.747e+04 2.007e+03 8.705 < 2e-16 ***
## BedroomAbvGr -9.825e+03 1.752e+03 -5.609 2.44e-08 ***
## KitchenAbvGr -1.779e+04 5.284e+03 -3.368 0.000778 ***
## TotRmsAbvGrd 1.605e+04 1.046e+03 15.355 < 2e-16 ***
## GarageCars 1.406e+04 1.823e+03 7.713 2.28e-14 ***
## ScreenPorch 6.545e+01 1.802e+01 3.632 0.000291 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37880 on 1439 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7736, Adjusted R-squared: 0.7717
## F-statistic: 409.7 on 12 and 1439 DF, p-value: < 2.2e-16
The new model appears to be closer to normal than the prior one, although there are still outliers influencing the results.
par(mfrow=c(2,2))
plot(model_1)Now, I can read in the test dataset, run a prediction using the new model, and post to Kaggle.
link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/test.csv"
test_data <- read.csv(link, header=TRUE, sep=",")
test_data$SalePrice <- predict(model_1, newdata = test_data, type = "response")
headers <- c("Id", "SalePrice")
test_export <- test_data[headers]
write.csv(test_export, file="kaggle_submission.csv", row.names=FALSE, na="0")