PROBLEM 1


link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/prob1.csv"

df <- read.csv(link, header=TRUE, sep=",")

XY <- as.data.frame(c(df["X3"], df["Y2"]))
names(XY) <- c("X", "Y")

kable(XY) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
X Y
9.5 20.8
3.7 14.6
11.7 18.0
7.4 7.3
5.3 19.4
7.4 13.5
7.4 14.7
8.6 15.3
9.1 12.6
11.4 13.0
8.4 13.1
7.3 10.3
11.3 14.9
4.4 14.8
9.3 16.2
10.9 15.7
10.9 16.3
7.7 11.5
7.7 12.2
11.5 11.8

A. P(X>x | Y>y)

y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)

cond_a1 <- filter(XY, XY$Y>y)
## Warning: package 'bindrcpp' was built under R version 3.4.4
cond_a2 <- filter(cond_a1, cond_a1$X>x)

p_1a <- nrow(cond_a2)/nrow(cond_a1)

print(sprintf("Y>y results in %g rows. Given that condition, X>x results in %g rows. The P(X>x | Y>y) is %g.", nrow(cond_a1), nrow(cond_a2), round(p_1a, 2)))
## [1] "Y>y results in 15 rows. Given that condition, X>x results in 3 rows. The P(X>x | Y>y) is 0.2."



B. P(X>x, Y>y)

y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)

cond_b1 <- subset(XY, XY$Y>y & XY$X>x)

p_1b <- nrow(cond_b1)/nrow(XY)

print(sprintf("Filtering the dataframe for Y>y and X>x results in %g rows. Given that condition, the P(X>x, Y>y) is %g.", nrow(cond_b1), round(p_1b, 2)))
## [1] "Filtering the dataframe for Y>y and X>x results in 3 rows. Given that condition, the P(X>x, Y>y) is 0.15."



C. P(X<x | Y>y)

y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)

cond_c1 <- filter(XY, XY$Y>y)
cond_c2 <- filter(cond_c1, cond_c1$X<x)

p_1c <- nrow(cond_c2)/nrow(cond_c1)

print(sprintf("Y>y results in %g rows. Given that condition, X<x results in %g rows. The P(X<x | Y>y) is %g.", nrow(cond_c1), nrow(cond_c2), round(p_1c, 2)))
## [1] "Y>y results in 15 rows. Given that condition, X<x results in 10 rows. The P(X<x | Y>y) is 0.67."



D. Table of Counts

y <- quantile(XY$Y, 0.25)
x <- quantile(XY$X, 0.75)

cond_y1 <- filter(XY, XY$Y<=y)
cond_y2 <- filter(XY, XY$Y>y)

count1_1 <- nrow(filter(cond_y1, cond_y1$X<=x))
count1_2 <- nrow(filter(cond_y2, cond_y2$X<=x))
count2_1 <- nrow(filter(cond_y1, cond_y1$X>x))
count2_2 <- nrow(filter(cond_y2, cond_y2$X>x))

count_table <- matrix(data = c(count1_1, count1_2, NA, count2_1, count2_2, NA, NA, NA, NA), ncol=3, nrow=3)

count_table[1,3] <- sum(count_table[1,1], count_table[1,2])
count_table[2,3] <- sum(count_table[2,1], count_table[2,2])
count_table[3,1] <- sum(count_table[1,1], count_table[2,1])
count_table[3,2] <- sum(count_table[1,2], count_table[2,2])
count_table[3,3] <- sum(count_table[1,1], count_table[1,2]) + sum(count_table[2,1], count_table[2,2])

count_table <- as.data.frame(count_table)

names(count_table) <- c("x <= 3rd quartile", "x > 3rd quartile", "TOTAL")
rownames(count_table) <- c("y <= 1st quartile", "y > 1st quartile", "TOTAL")

kable(count_table) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
x <= 3rd quartile x > 3rd quartile TOTAL
y <= 1st quartile 4 1 5
y > 1st quartile 12 3 15
TOTAL 16 4 20



E. Tests for Independence

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations below the 3rd quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does \(P(AB)=P(A)P(B)\)? Check mathematically, and then evaluate by running a Chi Square Test for association.

Splitting the training data according to quartiles is not a good test of independence, since the split is not random – there is likely to be a pattern to the values that fall above or below a certain quartile value.


We can confirm that \(P(A) \times P(B) \neq P(AB)\), which means the two variables are not independent.

  • \(P(A) = 16/20 = 0.67\)

  • \(P(B) = 15/20 = 0.75\)

  • \(P(A) \times P(B) = 0.5025\)

  • \(P(AB) = P(A|B) \ P(B) = 12/20 \times 15/20 = 0.45\)


Finally, we can use the \(\chi^2\) test for statistical independence.

  • Null hypothesis: \(A\) and \(B\) are independent.

  • Alternate hypothesis: \(A\) and \(B\) are not independent.


We get a p-value of \(0.8926\) when running our \(\chi^2\) test, which is much greater than \(0.05\). This means that there is a 90% chance that these variables could be independent due to random chance, so we reject the null hypothesis as true. \(A\) and \(B\) are not independent variables.

chisq.test(XY)
## 
##  Pearson's Chi-squared test
## 
## data:  XY
## X-squared = 11.833, df = 19, p-value = 0.8926



PROBLEM 2


A. Descriptive and Inferential Statistics

# Read in the training data

link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/train.csv"

train <- read.csv(link, header=TRUE, sep=",")

a | Univariate descriptive statistics and plots

Provide univariate descriptive statistics and appropriate plots for the training data set.

The train_df dataframe contains 1,460 observations, 37 predictor (independent) variables, and 1 response (dependent) variable – sale price.


After examining each variable using describe, we can see that LotFrontage may have many missing values – its \(n\)-count is about 200 observations smaller than the other variables.

d <- describe(train)

kable(d) %>%
  kable_styling() %>%
  scroll_box(width = "100%", height = "300px")
vars n mean sd median trimmed mad min max range skew kurtosis se
Id 1 1460 7.305000e+02 4.216100e+02 730.5 7.305000e+02 541.1490 1 1460 1459 0.0000000 -1.2024660 11.0340382
MSSubClass 2 1460 5.689726e+01 4.230057e+01 50.0 4.915240e+01 44.4780 20 190 170 1.4047656 1.5644157 1.1070565
MSZoning* 3 1460 4.028767e+00 6.320174e-01 4.0 4.061644e+00 0.0000 1 5 4 -1.7318311 6.2488744 0.0165407
LotFrontage 4 1201 7.004996e+01 2.428475e+01 69.0 6.894381e+01 16.3086 21 313 292 2.1581677 17.3413841 0.7007485
LotArea 5 1460 1.051683e+04 9.981265e+03 9478.5 9.563284e+03 2962.2348 1300 215245 213945 12.1826150 202.2623223 261.2216422
Street* 6 1460 1.995890e+00 6.399610e-02 2.0 2.000000e+00 0.0000 1 2 1 -15.4868962 238.0069738 0.0016749
Alley* 7 91 1.450549e+00 5.003052e-01 1.0 1.438356e+00 0.0000 1 2 1 0.1955092 -1.9832087 0.0524462
LotShape* 8 1460 2.942466e+00 1.409156e+00 4.0 3.053082e+00 0.0000 1 4 3 -0.6089215 -1.5964795 0.0368793
LandContour* 9 1460 3.777397e+00 7.076659e-01 4.0 3.997431e+00 0.0000 1 4 3 -3.1560041 8.6458033 0.0185205
Utilities* 10 1460 1.000685e+00 2.617120e-02 1.0 1.000000e+00 0.0000 1 2 1 38.1314686 1453.0041082 0.0006849
LotConfig* 11 1460 4.019178e+00 1.622634e+00 5.0 4.273973e+00 0.0000 1 5 4 -1.1332994 -0.5853058 0.0424663
LandSlope* 12 1460 1.062329e+00 2.762325e-01 1.0 1.000000e+00 0.0000 1 3 2 4.8037958 24.4716699 0.0072293
Neighborhood* 13 1460 1.315000e+01 5.893512e+00 13.0 1.310702e+01 7.4130 1 25 24 0.0180075 -1.0564682 0.1542403
Condition1* 14 1460 3.031507e+00 8.685151e-01 3.0 3.000000e+00 0.0000 1 9 8 3.0129949 16.3422050 0.0227301
Condition2* 15 1460 3.008219e+00 2.590399e-01 3.0 3.000000e+00 0.0000 1 8 7 13.1447909 247.5370605 0.0067794
BldgType* 16 1460 1.493151e+00 1.198277e+00 1.0 1.143836e+00 0.0000 1 5 4 2.2410358 3.4057174 0.0313603
HouseStyle* 17 1460 4.038356e+00 1.911305e+00 3.0 4.029966e+00 1.4826 1 8 7 0.3061246 -0.9641411 0.0500211
OverallQual 18 1460 6.099315e+00 1.382996e+00 6.0 6.079623e+00 1.4826 1 10 9 0.2164984 0.0876226 0.0361947
OverallCond 19 1460 5.575342e+00 1.112799e+00 5.0 5.477740e+00 0.0000 1 9 8 0.6916440 1.0929087 0.0291233
YearBuilt 20 1460 1.971268e+03 3.020290e+01 1973.0 1.974127e+03 37.0650 1872 2010 138 -0.6122012 -0.4456575 0.7904461
YearRemodAdd 21 1460 1.984866e+03 2.064541e+01 1994.0 1.986369e+03 19.2738 1950 2010 60 -0.5025278 -1.2743655 0.5403150
RoofStyle* 22 1460 2.410274e+00 8.349978e-01 2.0 2.264555e+00 0.0000 1 6 5 1.4707694 0.6051553 0.0218529
RoofMatl* 23 1460 2.075343e+00 5.991268e-01 2.0 2.000000e+00 0.0000 1 8 7 8.0927465 66.2778413 0.0156799
Exterior1st* 24 1460 1.062466e+01 3.197659e+00 13.0 1.092979e+01 1.4826 1 15 14 -0.7248218 -0.3678431 0.0836866
Exterior2nd* 25 1460 1.133973e+01 3.540570e+00 14.0 1.164897e+01 2.9652 1 16 15 -0.6915394 -0.5240999 0.0926610
MasVnrType* 26 1452 2.761019e+00 6.157107e-01 3.0 2.728916e+00 0.0000 1 4 3 -0.0673056 -0.1344926 0.0161582
MasVnrArea 27 1452 1.036853e+02 1.810662e+02 0.0 6.315232e+01 0.0000 0 1600 1600 2.6635721 10.0256421 4.7517556
ExterQual* 28 1460 3.539726e+00 6.939946e-01 4.0 3.650685e+00 0.0000 1 4 3 -1.8265061 3.8628097 0.0181627
ExterCond* 29 1460 4.733562e+00 7.318072e-01 5.0 4.946062e+00 0.0000 1 5 4 -2.5600360 5.2908714 0.0191523
Foundation* 30 1460 2.396575e+00 7.223936e-01 2.0 2.457192e+00 1.4826 1 6 5 0.0910300 1.0164412 0.0189059
BsmtQual* 31 1423 3.261420e+00 8.677500e-01 3.0 3.432836e+00 1.4826 1 4 3 -1.3112186 1.2749390 0.0230034
BsmtCond* 32 1423 3.812368e+00 6.586556e-01 4.0 4.000000e+00 0.0000 1 4 3 -3.3947570 10.1393207 0.0174605
BsmtExposure* 33 1422 3.265119e+00 1.147481e+00 4.0 3.456063e+00 0.0000 1 4 3 -1.1466827 -0.3879998 0.0304296
BsmtFinType1* 34 1423 3.732256e+00 1.825932e+00 3.0 3.790167e+00 2.9652 1 6 5 -0.0154188 -1.3923444 0.0484041
BsmtFinSF1 35 1460 4.436397e+02 4.560981e+02 383.5 3.860762e+02 568.5771 0 5644 5644 1.6820413 11.0568141 11.9366326
BsmtFinType2* 36 1422 5.708158e+00 9.363582e-01 6.0 5.978910e+00 0.0000 1 6 5 -3.5641512 12.3164870 0.0248309
BsmtFinSF2 37 1460 4.654932e+01 1.613193e+02 0.0 1.382705e+00 0.0000 0 1474 1474 4.2465214 20.0088641 4.2219183
BsmtUnfSF 38 1460 5.672404e+02 4.418670e+02 477.5 5.192885e+02 426.9888 0 2336 2336 0.9183784 0.4645113 11.5641868
TotalBsmtSF 39 1460 1.057429e+03 4.387053e+02 991.5 1.036695e+03 347.6697 0 6110 6110 1.5211239 13.1788560 11.4814431
Heating* 40 1460 2.036301e+00 2.951238e-01 2.0 2.000000e+00 0.0000 1 6 5 9.8348595 110.9795650 0.0077237
HeatingQC* 41 1460 2.538356e+00 1.739524e+00 1.0 2.422945e+00 0.0000 1 5 4 0.4822257 -1.5124682 0.0455254
CentralAir* 42 1460 1.934932e+00 2.467312e-01 2.0 2.000000e+00 0.0000 1 2 1 -3.5231347 10.4196162 0.0064573
Electrical* 43 1459 4.681974e+00 1.051629e+00 5.0 5.000000e+00 0.0000 1 5 4 -3.0554792 7.4856348 0.0275318
X1stFlrSF 44 1460 1.162627e+03 3.865877e+02 1087.0 1.129991e+03 347.6697 334 4692 4358 1.3739290 5.7101321 10.1174635
X2ndFlrSF 45 1460 3.469925e+02 4.365284e+02 0.0 2.853639e+02 0.0000 0 2065 2065 0.8113600 -0.5590240 11.4244713
LowQualFinSF 46 1460 5.844520e+00 4.862308e+01 0.0 0.000000e+00 0.0000 0 572 572 8.9928333 82.8282385 1.2725242
GrLivArea 47 1460 1.515464e+03 5.254804e+02 1464.0 1.467670e+03 483.3276 334 5642 5308 1.3637536 4.8634828 13.7524502
BsmtFullBath 48 1460 4.253425e-01 5.189106e-01 0.0 3.921233e-01 0.0000 0 3 3 0.5948424 -0.8432916 0.0135805
BsmtHalfBath 49 1460 5.753420e-02 2.387526e-01 0.0 0.000000e+00 0.0000 0 2 2 4.0949749 16.3099569 0.0062484
FullBath 50 1460 1.565068e+00 5.509158e-01 2.0 1.560788e+00 0.0000 0 3 3 0.0364865 -0.8611503 0.0144181
HalfBath 51 1460 3.828767e-01 5.028854e-01 0.0 3.433219e-01 0.0000 0 2 2 0.6745093 -1.0799824 0.0131611
BedroomAbvGr 52 1460 2.866438e+00 8.157780e-01 3.0 2.852740e+00 0.0000 0 8 8 0.2113551 2.2119881 0.0213499
KitchenAbvGr 53 1460 1.046575e+00 2.203382e-01 1.0 1.000000e+00 0.0000 0 3 3 4.4791783 21.4211386 0.0057665
KitchenQual* 54 1460 3.339726e+00 8.301613e-01 4.0 3.504281e+00 0.0000 1 4 3 -1.4198855 1.7156248 0.0217263
TotRmsAbvGrd 55 1460 6.517808e+00 1.625393e+00 6.0 6.408390e+00 1.4826 2 14 12 0.6749517 0.8683368 0.0425385
Functional* 56 1460 6.749315e+00 9.796592e-01 7.0 7.000000e+00 0.0000 1 7 6 -4.0765682 16.3748321 0.0256389
Fireplaces 57 1460 6.130137e-01 6.446664e-01 1.0 5.342466e-01 1.4826 0 3 3 0.6482311 -0.2244068 0.0168717
FireplaceQu* 58 770 3.733766e+00 1.132578e+00 3.0 3.798701e+00 1.4826 1 5 4 -0.1578800 -0.9845157 0.0408153
GarageType* 59 1379 3.279188e+00 1.785588e+00 2.0 3.105882e+00 0.0000 1 6 5 0.7618616 -1.3020231 0.0480838
GarageYrBlt 60 1379 1.978506e+03 2.468972e+01 1980.0 1.981065e+03 31.1346 1900 2010 110 -0.6480025 -0.4249123 0.6648660
GarageFinish* 61 1379 2.183466e+00 8.128963e-01 2.0 2.228959e+00 1.4826 1 3 2 -0.3465419 -1.4058467 0.0218904
GarageCars 62 1460 1.767123e+00 7.473150e-01 2.0 1.773973e+00 0.0000 0 4 4 -0.3418454 0.2117307 0.0195581
GarageArea 63 1460 4.729801e+02 2.138048e+02 480.0 4.698082e+02 177.9120 0 1418 1418 0.1796113 0.9044687 5.5955284
GarageQual* 64 1379 4.864395e+00 6.105280e-01 5.0 5.000000e+00 0.0000 1 5 4 -4.4312099 18.2507961 0.0164408
GarageCond* 65 1379 4.899928e+00 5.224912e-01 5.0 5.000000e+00 0.0000 1 5 4 -5.2754778 26.7731497 0.0140701
PavedDrive* 66 1460 2.856164e+00 4.965919e-01 3.0 3.000000e+00 0.0000 1 3 2 -3.3021422 9.2205389 0.0129964
WoodDeckSF 67 1460 9.424452e+01 1.253388e+02 0.0 7.175771e+01 0.0000 0 857 857 1.5382100 2.9704171 3.2802662
OpenPorchSF 68 1460 4.666027e+01 6.625603e+01 25.0 3.323288e+01 37.0650 0 547 547 2.3594857 8.4414910 1.7339995
EnclosedPorch 69 1460 2.195411e+01 6.111915e+01 0.0 3.866438e+00 0.0000 0 552 552 3.0835258 10.3726341 1.5995612
X3SsnPorch 70 1460 3.409589e+00 2.931733e+01 0.0 0.000000e+00 0.0000 0 508 508 10.2831784 123.0623116 0.7672696
ScreenPorch 71 1460 1.506096e+01 5.575742e+01 0.0 0.000000e+00 0.0000 0 480 480 4.1137473 18.3426076 1.4592383
PoolArea 72 1460 2.758904e+00 4.017731e+01 0.0 0.000000e+00 0.0000 0 738 738 14.7979183 222.1917078 1.0514882
PoolQC* 73 7 2.142857e+00 8.997354e-01 2.0 2.142857e+00 1.4826 1 3 2 -0.2161500 -1.9030436 0.3400680
Fence* 74 281 2.427046e+00 8.634533e-01 3.0 2.484444e+00 0.0000 1 4 3 -0.5712072 -0.8825197 0.0515093
MiscFeature* 75 54 2.907407e+00 4.458834e-01 3.0 3.000000e+00 0.0000 1 4 3 -2.9309302 10.7075632 0.0606770
MiscVal 76 1460 4.348904e+01 4.961230e+02 0.0 0.000000e+00 0.0000 0 15500 15500 24.4265224 697.6400721 12.9841330
MoSold 77 1460 6.321918e+00 2.703626e+00 6.0 6.252568e+00 2.9652 1 12 11 0.2116175 -0.4103846 0.0707571
YrSold 78 1460 2.007816e+03 1.328095e+00 2008.0 2.007770e+03 1.4826 2006 2010 4 0.0960708 -1.1931116 0.0347578
SaleType* 79 1460 8.509589e+00 1.560932e+00 9.0 8.922089e+00 0.0000 1 9 8 -3.8316025 14.5748669 0.0408515
SaleCondition* 80 1460 4.770548e+00 1.100854e+00 5.0 5.000000e+00 0.0000 1 6 5 -2.7355374 6.8235962 0.0288107
SalePrice 81 1460 1.809212e+05 7.944250e+04 163000.0 1.707833e+05 56338.8000 34900 755000 720100 1.8790086 6.4967893 2079.1053240



I limited the variables to those that were numeric. When plotting their distributions, I noticed many are right-skewed, suggesting that they may need a transform to approach normal.

# Select numeric variables and gather
gathered <- select_if(train, is.numeric) %>% gather()

# Plot distributions of variables
distribution_plot <- gathered %>%
  ggplot(aes(value)) +
  facet_wrap( ~ key, scales = "free") +
  ggtitle("Housing Dataset Density Plots (38 Numeric Variables)") +
  geom_density() +
  theme(
    strip.text = element_text(size = 8),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.text = element_blank()
  )

distribution_plot




b | Scatterplot matrix

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

First Floor Square Feet and Overall Quality Rating appear to have a positive relationship to Sale Price, which makes intuitive sense. A home with more square feet and a higher quality rating can demand a higher price.

to_matrix <- data.matrix(dplyr::select(train, SalePrice, X1stFlrSF, OverallQual))

pairs(~SalePrice + X1stFlrSF + OverallQual, data = to_matrix)




c | Correlation matrix

Derive a correlation matrix for any THREE quantitative variables in the dataset.

This correlation matrix evaluates the strength of the relationship between Sales Price and three variables: 1st Floor Square Feet, Basement Square Feet, and Overall Quality Rating. Of the three variables, Overall Quality is most strongly correlated with Sales Price.

to_corr <- dplyr::select(train, X1stFlrSF, TotalBsmtSF, OverallQual, SalePrice)

corr <- cor(to_corr, method = "pearson", use = "complete.obs")

corr
##             X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF   1.0000000   0.8195300   0.4762238 0.6058522
## TotalBsmtSF 0.8195300   1.0000000   0.5378085 0.6135806
## OverallQual 0.4762238   0.5378085   1.0000000 0.7909816
## SalePrice   0.6058522   0.6135806   0.7909816 1.0000000
corrplot(corr, type = "upper", addCoef.col = "white", tl.srt=45, tl.col = "black")




d | Hypothesis testing

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval.

Hypotheses to test:

  • Null: the true correlation between each variable and Sale Price is equal to zero.

  • Alternate: the true correlation is not equal to zero.


First Floor Square Feet

We are 80% confident that this variable’s true correlation to Sale Price is between 0.58 and 0.63. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.

FirstFloorSqFt <- as.matrix(dplyr::select(train, X1stFlrSF))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))

cor.test(FirstFloorSqFt, SalePrice, conf.level=0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  FirstFloorSqFt and SalePrice
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5841687 0.6266715
## sample estimates:
##       cor 
## 0.6058522


Basement Square Feet

We are 80% confident that this variable’s true correlation to Sale Price is between 0.59 and 0.63. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.

BasementSqFt <- as.matrix(dplyr::select(train, TotalBsmtSF))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))

cor.test(BasementSqFt, SalePrice, conf.level=0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  BasementSqFt and SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806


Overall Quality

We are 80% confident that this variable’s true correlation to Sale Price is between 0.78 and 0.80. Since the p-value is less than 0.2, we can reject the null hypothesis and accept the alternate as true.

OverallQuality <- as.matrix(dplyr::select(train, OverallQual))
SalePrice <- as.matrix(dplyr::select(train, SalePrice))

cor.test(OverallQuality, SalePrice, conf.level=0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  OverallQuality and SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780752 0.8032204
## sample estimates:
##       cor 
## 0.7909816



e | Analysis

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Familywise error refers to the likelihood of encountering a Type I error – a false positive – in a series of hypothesis tests. The probability of this seems low, since the p-value of each of our correlations above are so small.




B. Linear Algebra and Correlation

a | Invert the correlation matrix

# Original correlation matrix
corr
##             X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF   1.0000000   0.8195300   0.4762238 0.6058522
## TotalBsmtSF 0.8195300   1.0000000   0.5378085 0.6135806
## OverallQual 0.4762238   0.5378085   1.0000000 0.7909816
## SalePrice   0.6058522   0.6135806   0.7909816 1.0000000
# Inverse of correlation matrix
corr_inverse <- solve(corr)
corr_inverse
##              X1stFlrSF TotalBsmtSF OverallQual  SalePrice
## X1stFlrSF    3.2585965  -2.3703774   0.3583515 -0.8032598
## TotalBsmtSF -2.3703774   3.3472154  -0.4881843 -0.2315431
## OverallQual  0.3583515  -0.4881843   2.7426122 -2.0869234
## SalePrice   -0.8032598  -0.2315431  -2.0869234  3.2794450



b | Check precision and correlation matrices

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

First, corr times corr_inverse:

a <- corr %*% corr_inverse
round(a)
##             X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF           1           0           0         0
## TotalBsmtSF         0           1           0         0
## OverallQual         0           0           1         0
## SalePrice           0           0           0         1


Then, corr_inverse times corr:

b <- corr_inverse %*% corr
round(b)
##             X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF           1           0           0         0
## TotalBsmtSF         0           1           0         0
## OverallQual         0           0           1         0
## SalePrice           0           0           0         1


Both calculations result in 1’s along the diagonal and values very close to zero in every other position – the identity matrix.




c | LU decomposition

lu_corr <- lu.decomposition(corr)
lu_corr
## $L
##           [,1]      [,2]      [,3] [,4]
## [1,] 1.0000000 0.0000000 0.0000000    0
## [2,] 0.8195300 1.0000000 0.0000000    0
## [3,] 0.4762238 0.4492753 1.0000000    0
## [4,] 0.6058522 0.3565073 0.6363648    1
## 
## $U
##      [,1]      [,2]      [,3]      [,4]
## [1,]    1 0.8195300 0.4762238 0.6058522
## [2,]    0 0.3283706 0.1475288 0.1170665
## [3,]    0 0.0000000 0.7069298 0.4498653
## [4,]    0 0.0000000 0.0000000 0.3049296
# Check that the decomposition results in the original correlation matrix
lu_check <- lu_corr$L %*% lu_corr$U
corr == lu_check
##             X1stFlrSF TotalBsmtSF OverallQual SalePrice
## X1stFlrSF        TRUE        TRUE        TRUE      TRUE
## TotalBsmtSF      TRUE        TRUE        TRUE      TRUE
## OverallQual      TRUE        TRUE        TRUE      TRUE
## SalePrice        TRUE        TRUE        TRUE      TRUE



C. Calculus-Based Probability and Statistics

a | Select and shift variable

Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

I chose the GrLivArea variable – the square feet of above grade (ground) living area. Before transformation, the variable was skewed to the right. After centering the variable at zero and taking the absolute value, the variable more closely resembles an exponential distribution.

# Original variable
hist(train$GrLivArea, breaks=50, freq = FALSE, main = "Histogram of LivingArea")

# Tranform variable
LivingArea <- abs(scale(train$GrLivArea))

hist(LivingArea, breaks=50, freq = FALSE)




b | Fit Exponential PDF

Then load the MASS package and run fitdistr to fit an exponential probability density function. Take 1000 samples from this exponential distribution using this value. Plot a histogram and compare it with a histogram of your original variable.

When fitted to an exponential distribution, the estimated rate of change \(\lambda\) is about \(1.32\).

f <- fitdistr(LivingArea, densfun = "exponential")
rate_est <- f$estimate
rate_est
##     rate 
## 1.322546


We can take 1,000 samples of an exponential function using the rate of change.

distribution <- rexp(1000, rate_est)

hist(distribution, freq = FALSE, breaks=50, main = "Exponential PDF Model")
curve(dexp(x, rate = rate_est), col = "red", add = TRUE)


We see the exponential PDF model fits the transformed Living Area variable pretty well.

hist(LivingArea, breaks=50, freq = FALSE, main = "Exponential PDF Model Fitted to Living Area")
curve(dexp(x, rate = rate_est), col = "red", add = TRUE)




c | Find percentiles and CI

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

qexp(0.05, rate = rate_est)
## [1] 0.03878376
qexp(0.95, rate = rate_est)
## [1] 2.265126



d | Empirical percentiles

Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

LivingArea_norm <- log(train$GrLivArea)
hist(LivingArea_norm, breaks=50, freq = FALSE)

qnorm(0.05)
## [1] -1.644854
qnorm(0.95)
## [1] 1.644854



D. Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

First, I restricted the data to numeric variables. Then, as a baseline, a created a model with all of the variables, and examined the residuals.

The complete-variable model results in an Adjusted R-Squared of 0.8036, which is pretty good. However, since we’re using a lot of variables, there’s a chance the data is being overfit.

train_model <- select_if(train, is.numeric)

model <- lm(data=train_model, SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold)

summary(model)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + 
##     X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + 
##     FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + 
##     PoolArea + MiscVal + MoSold + YrSold, data = train_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442865  -16873   -2581   14998  318042 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.232e+05  1.701e+06  -0.190 0.849317    
## MSSubClass    -2.005e+02  3.449e+01  -5.814 8.03e-09 ***
## LotFrontage   -1.161e+02  6.124e+01  -1.896 0.058203 .  
## LotArea        5.454e-01  1.573e-01   3.466 0.000548 ***
## OverallQual    1.870e+04  1.478e+03  12.646  < 2e-16 ***
## OverallCond    5.227e+03  1.367e+03   3.824 0.000139 ***
## YearBuilt      3.170e+02  8.762e+01   3.617 0.000311 ***
## YearRemodAdd   1.206e+02  8.661e+01   1.392 0.164174    
## MasVnrArea     3.160e+01  7.006e+00   4.511 7.15e-06 ***
## BsmtFinSF1     1.739e+01  5.835e+00   2.980 0.002947 ** 
## BsmtFinSF2     8.362e+00  8.763e+00   0.954 0.340205    
## BsmtUnfSF      5.006e+00  5.275e+00   0.949 0.342890    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.591e+01  7.356e+00   6.241 6.21e-10 ***
## X2ndFlrSF      4.668e+01  6.099e+00   7.654 4.28e-14 ***
## LowQualFinSF   3.415e+01  2.788e+01   1.225 0.220788    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   8.980e+03  3.194e+03   2.812 0.005018 ** 
## BsmtHalfBath   2.490e+03  5.071e+03   0.491 0.623487    
## FullBath       5.390e+03  3.529e+03   1.527 0.126941    
## HalfBath      -1.119e+03  3.320e+03  -0.337 0.736244    
## BedroomAbvGr  -1.023e+04  2.154e+03  -4.750 2.30e-06 ***
## KitchenAbvGr  -2.193e+04  6.704e+03  -3.271 0.001105 ** 
## TotRmsAbvGrd   5.440e+03  1.486e+03   3.661 0.000263 ***
## Fireplaces     4.375e+03  2.188e+03   2.000 0.045793 *  
## GarageYrBlt   -4.914e+01  9.093e+01  -0.540 0.589011    
## GarageCars     1.679e+04  3.487e+03   4.815 1.68e-06 ***
## GarageArea     6.488e+00  1.211e+01   0.536 0.592338    
## WoodDeckSF     2.155e+01  1.002e+01   2.151 0.031713 *  
## OpenPorchSF   -2.315e+00  1.948e+01  -0.119 0.905404    
## EnclosedPorch  7.233e+00  2.061e+01   0.351 0.725733    
## X3SsnPorch     3.458e+01  3.749e+01   0.922 0.356593    
## ScreenPorch    5.797e+01  2.040e+01   2.842 0.004572 ** 
## PoolArea      -6.126e+01  2.984e+01  -2.053 0.040326 *  
## MiscVal       -3.850e+00  6.955e+00  -0.554 0.579980    
## MoSold        -2.240e+02  4.227e+02  -0.530 0.596213    
## YrSold        -2.536e+02  8.454e+02  -0.300 0.764216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36790 on 1086 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8095, Adjusted R-squared:  0.8036 
## F-statistic: 135.7 on 34 and 1086 DF,  p-value: < 2.2e-16


When we look at plots of the complete-variable model, we see that there are outliers that are influencing the results, but on the whole the residuals are close to normal. However, I’ll if I can improve it by performing backwards-substitution.

par(mfrow=c(2,2))
plot(model)


After a few rounds of removing variables with low p-values, I’m left with 12 variables and an Adjusted R-Squared of 0.7717. That means that these 12 variables explain about 77% of the variability in Sale Price.

model_1 <- lm(data=train_model, SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageCars + ScreenPorch)

summary(model_1)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + MasVnrArea + BsmtFullBath + BedroomAbvGr + 
##     KitchenAbvGr + TotRmsAbvGrd + GarageCars + ScreenPorch, data = train_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -321953  -20460   -3078   15431  390991 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.464e+05  9.482e+04  -7.872 6.82e-15 ***
## MSSubClass   -1.536e+02  2.512e+01  -6.115 1.24e-09 ***
## LotArea       8.078e-01  1.048e-01   7.711 2.32e-14 ***
## OverallQual   2.489e+04  1.132e+03  21.977  < 2e-16 ***
## OverallCond   4.570e+03  9.929e+02   4.603 4.52e-06 ***
## YearBuilt     3.317e+02  4.797e+01   6.916 6.98e-12 ***
## MasVnrArea    4.979e+01  6.190e+00   8.043 1.81e-15 ***
## BsmtFullBath  1.747e+04  2.007e+03   8.705  < 2e-16 ***
## BedroomAbvGr -9.825e+03  1.752e+03  -5.609 2.44e-08 ***
## KitchenAbvGr -1.779e+04  5.284e+03  -3.368 0.000778 ***
## TotRmsAbvGrd  1.605e+04  1.046e+03  15.355  < 2e-16 ***
## GarageCars    1.406e+04  1.823e+03   7.713 2.28e-14 ***
## ScreenPorch   6.545e+01  1.802e+01   3.632 0.000291 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37880 on 1439 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7736, Adjusted R-squared:  0.7717 
## F-statistic: 409.7 on 12 and 1439 DF,  p-value: < 2.2e-16


The new model appears to be closer to normal than the prior one, although there are still outliers influencing the results.

par(mfrow=c(2,2))
plot(model_1)


Now, I can read in the test dataset, run a prediction using the new model, and post to Kaggle.

link <- "C:/Users/Kavya/Desktop/Kavya/msds/DATA 605 Computational Mathematics/Final Project/test.csv"

test_data <- read.csv(link, header=TRUE, sep=",")

test_data$SalePrice <- predict(model_1, newdata = test_data, type = "response")

headers <- c("Id", "SalePrice")

test_export <- test_data[headers]

write.csv(test_export, file="kaggle_submission.csv", row.names=FALSE, na="0")

Kaggle Username: ksb357

Kaggle Score: 1.20439