library(tidyverse)
library(GGally)
library(confintr)
library(MASS)
library(matlib)
library(matrixcalc)
library(Hmisc)
train <- as.data.frame(read.csv('hp_train.csv'))
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.
#Remove Columns with NA values
train <- train[,colSums(is.na(train)) == 0]
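As a quick sanity check on this step (a sketch; it simply re-reads hp_train.csv), the number of dropped columns can be counted by comparing against a fresh read of the file:
raw <- read.csv('hp_train.csv')
ncol(raw) - ncol(train) # columns removed for containing NA values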
head(train[,2:ncol(train)])
## MSSubClass MSZoning LotArea Street LotShape LandContour Utilities LotConfig
## 1 60 RL 8450 Pave Reg Lvl AllPub Inside
## 2 20 RL 9600 Pave Reg Lvl AllPub FR2
## 3 60 RL 11250 Pave IR1 Lvl AllPub Inside
## 4 70 RL 9550 Pave IR1 Lvl AllPub Corner
## 5 60 RL 14260 Pave IR1 Lvl AllPub FR2
## 6 50 RL 14115 Pave IR1 Lvl AllPub Inside
## LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual
## 1 Gtl CollgCr Norm Norm 1Fam 2Story 7
## 2 Gtl Veenker Feedr Norm 1Fam 1Story 6
## 3 Gtl CollgCr Norm Norm 1Fam 2Story 7
## 4 Gtl Crawfor Norm Norm 1Fam 2Story 7
## 5 Gtl NoRidge Norm Norm 1Fam 2Story 8
## 6 Gtl Mitchel Norm Norm 1Fam 1.5Fin 5
## OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 1 5 2003 2003 Gable CompShg VinylSd VinylSd
## 2 8 1976 1976 Gable CompShg MetalSd MetalSd
## 3 5 2001 2002 Gable CompShg VinylSd VinylSd
## 4 5 1915 1970 Gable CompShg Wd Sdng Wd Shng
## 5 5 2000 2000 Gable CompShg VinylSd VinylSd
## 6 5 1993 1995 Gable CompShg VinylSd VinylSd
## ExterQual ExterCond Foundation BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Gd TA PConc 706 0 150 856
## 2 TA TA CBlock 978 0 284 1262
## 3 Gd TA PConc 486 0 434 920
## 4 TA TA BrkTil 216 0 540 756
## 5 Gd TA PConc 655 0 490 1145
## 6 TA TA Wood 732 0 64 796
## Heating HeatingQC CentralAir X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 1 GasA Ex Y 856 854 0 1710
## 2 GasA Ex Y 1262 0 0 1262
## 3 GasA Ex Y 920 866 0 1786
## 4 GasA Gd Y 961 756 0 1717
## 5 GasA Ex Y 1145 1053 0 2198
## 6 GasA Ex Y 796 566 0 1362
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## 1 1 0 2 1 3 1
## 2 0 1 2 0 3 1
## 3 1 0 2 1 3 1
## 4 1 0 1 0 3 1
## 5 1 0 2 1 4 1
## 6 1 0 1 1 1 1
## KitchenQual TotRmsAbvGrd Functional Fireplaces GarageCars GarageArea
## 1 Gd 8 Typ 0 2 548
## 2 TA 6 Typ 1 2 460
## 3 Gd 6 Typ 1 2 608
## 4 Gd 7 Typ 1 3 642
## 5 Gd 9 Typ 1 3 836
## 6 TA 5 Typ 0 2 480
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1 Y 0 61 0 0 0
## 2 Y 298 0 0 0 0
## 3 Y 0 42 0 0 0
## 4 Y 0 35 272 0 0
## 5 Y 192 84 0 0 0
## 6 Y 40 30 0 320 0
## PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 0 2 2008 WD Normal 208500
## 2 0 0 5 2007 WD Normal 181500
## 3 0 0 9 2008 WD Normal 223500
## 4 0 0 2 2006 WD Abnorml 140000
## 5 0 0 12 2008 WD Normal 250000
## 6 0 700 10 2009 WD Normal 143000
# Data types of the variables
type <- data.frame(Type = table(sapply(train, class)))
ggplot(type, aes(x = Type.Var1, y = Type.Freq)) +
  geom_bar(stat = "identity", width = 0.25, color = 'black', fill = 'seagreen') +
  labs(x = 'Type', y = 'Count')
sp <- ggplot(train, aes(SalePrice)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'LightGreen', color = 'Black') +
  geom_density()
sp
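A numeric univariate summary complements the plot; the mean of SalePrice sitting above its median is consistent with the right skew visible in the histogram:
summary(train$SalePrice)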
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
sub <- train %>% dplyr::select(SalePrice, PoolArea, LotArea)
ggpairs(sub)
Derive a correlation matrix for any three quantitative variables in the
dataset.
sub2 <- train %>% dplyr::select(SalePrice, LotArea, OverallCond)
cm <- cor(sub2)
cm
## SalePrice LotArea OverallCond
## SalePrice 1.00000000 0.26384335 -0.07785589
## LotArea 0.26384335 1.00000000 -0.00563627
## OverallCond -0.07785589 -0.00563627 1.00000000
Test the hypotheses that the correlation between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
# Pairwise correlations pulled from the correlation matrix, for reference
sl <- cm[1, 2] # SalePrice v. LotArea
so <- cm[1, 3] # SalePrice v. OverallCond
lo <- cm[2, 3] # LotArea v. OverallCond
n <- nrow(train) # sample size
# 80% confidence intervals via confintr::ci_cor; probs must be passed by name
ci <- c(0.1, 0.9)
ci_1 <- ci_cor(sub2 %>% dplyr::select(c(1, 2)), probs = ci)
ci_2 <- ci_cor(sub2 %>% dplyr::select(c(1, 3)), probs = ci)
ci_3 <- ci_cor(sub2 %>% dplyr::select(c(2, 3)), probs = ci)
print('Sale Price v. Lot Area: ')
## [1] "Sale Price v. Lot Area: "
print(ci_1$interval)
## [1] 0.2154574 0.3109369
print('Sale Price v. Overall Condition: ')
## [1] "Sale Price v. Overall Condition: "
print(ci_2$interval)
## [1] -0.12864437 -0.02666008
print('Lot Area v. Overall Condition: ')
## [1] "Lot Area v. Overall Condition: "
print(ci_3$interval)
## [1] -0.05692212 0.04567924
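The zero-correlation hypotheses themselves can be tested with cor.test, which reports a t-statistic and p-value for each pair (a minimal sketch, using conf.level = 0.80 to match the intervals above):
cor.test(sub2$SalePrice, sub2$LotArea, conf.level = 0.80)
cor.test(sub2$SalePrice, sub2$OverallCond, conf.level = 0.80)
cor.test(sub2$LotArea, sub2$OverallCond, conf.level = 0.80)
As for familywise error: running three tests at the 80% level means the chance of at least one false rejection could approach 1 - 0.8^3 ≈ 0.49 under independence, so some concern is warranted and a Bonferroni-style adjustment is worth considering.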
Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points
# LU decomposition via Gaussian elimination (no pivoting):
# returns list(U, L) such that L %*% U equals the input matrix
lu_factor <- function(x)
{
  L <- diag(nrow(x))                      # L starts as the identity matrix
  for (c in seq_len(ncol(x) - 1)) {
    for (r in seq(c + 1, nrow(x))) {
      factor <- x[r, c] / x[c, c]         # multiplier that zeroes out x[r, c]
      x[r, ] <- x[r, ] - factor * x[c, ]  # row-reduce the working copy (becomes U)
      L[r, c] <- factor                   # record the multiplier below the diagonal
    }
  }
  list(x, L)
}
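Before using the helper, it can be sanity-checked on the correlation matrix defined above: multiplying the two returned factors back together should reproduce the input (a minimal sketch):
chk <- lu_factor(cm)
max(abs(chk[[2]] %*% chk[[1]] - cm)) # L %*% U minus the original; should be ~0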
# Invert your correlation matrix from above (base R's solve(cm) would be equivalent)
pm <- inv(cm)
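As the prompt notes, the diagonal of this precision matrix contains the variance inflation factors; with correlations this weak they should all sit near 1 (a sketch):
diag(pm) # variance inflation factors for the three variables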
temp <- cm %*% pm
round(temp,1)
## [,1] [,2] [,3]
## SalePrice 1 0 0
## LotArea 0 1 0
## OverallCond 0 0 1
# temp is numerically the identity, so temp %*% cm recovers the correlation matrix;
# the LU decomposition below therefore factors the correlation matrix itself
m <- temp %*% cm
m
## SalePrice LotArea OverallCond
## SalePrice 1.00000000 0.263843349 -0.077855893
## LotArea 0.26384335 0.999999999 -0.005636267
## OverallCond -0.07785589 -0.005636267 0.999999997
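For the product in the other order, multiplying the precision matrix by the correlation matrix should likewise return the 3x3 identity up to floating-point error (a sketch):
round(pm %*% cm, 1)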
#Conduct LU decomposition on the matrix
print(lu_factor(m))
## [[1]]
## SalePrice LotArea OverallCond
## SalePrice 1 0.2638433 -0.07785589
## LotArea 0 0.9303867 0.01490549
## OverallCond 0 0.0000000 0.99369966
##
## [[2]]
## [,1] [,2] [,3]
## [1,] 1.00000000 0.00000000 0
## [2,] 0.26384335 1.00000000 0
## [3,] -0.07785589 0.01602075 1
lu.decomposition(m)
## $L
## [,1] [,2] [,3]
## [1,] 1.00000000 0.00000000 0
## [2,] 0.26384335 1.00000000 0
## [3,] -0.07785589 0.01602075 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 0.2638433 -0.07785589
## [2,] 0 0.9303867 0.01490549
## [3,] 0 0.0000000 0.99369966
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
# Keep only the numeric columns and scan their histograms for right skew
train_num <- select_if(train, is.numeric)
hist.data.frame(train_num)
# BsmtFinSF1 is clearly skewed to the right
ggplot(train_num, aes(BsmtFinSF1)) + geom_histogram(bins = 50, color = 'black', fill = 'lightpink')
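One check before fitting (a sketch): BsmtFinSF1's minimum is 0, and no shift was applied here; that is workable for what follows because the exponential MLE is simply 1/mean and remains well defined with zero values:
summary(train_num$BsmtFinSF1)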
Then load the MASS package and run fitdistr to fit an exponential probability density function (see https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.
data <- train_num$BsmtFinSF1
# Maximum-likelihood fit; for the exponential, the optimal rate is 1/mean(data)
fit <- fitdistr(data, "exponential")
lambda <- fit$estimate
# Draw 1000 samples from the fitted exponential distribution
samples <- rexp(1000, lambda)
samples <- as.data.frame(samples)
ggplot(samples, aes(samples)) + geom_histogram(bins = 50, color = 'black', fill = 'lightblue')
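Overlaying the simulated draws on the observed values makes the comparison the prompt asks for more direct (a minimal sketch reusing data and samples from above):
comp <- rbind(
  data.frame(value = data, source = 'BsmtFinSF1 (observed)'),
  data.frame(value = samples$samples, source = 'exponential (simulated)')
)
ggplot(comp, aes(value, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.6, position = 'identity')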
Using the exponential pdf, find the 5th and 95th percentiles using the
cumulative distribution function (CDF).
qexp(0.05, lambda)
## [1] 22.75574
qexp(0.95, lambda)
## [1] 1329.026
qexp(c(0.05,0.95), lambda)
## [1] 22.75574 1329.02585
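These quantiles can be cross-checked against the closed-form inverse CDF of the exponential, F^-1(p) = -log(1 - p)/λ:
-log(1 - c(0.05, 0.95)) / lambda # should match the qexp values above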
Also generate a 95% confidence interval from the empirical data, assuming normality.
m <- mean(data)
sd <- sd(data)
n <- nrow(train_num)
se <- qnorm(0.975)*(sd/sqrt(n))
c(m-se,m+se)
## [1] 420.2444 467.0351
Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points
# Empirical percentiles of the fitted variable (BsmtFinSF1), not a different column
quantile(data, probs = c(0.05, 0.95))
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com username and score. 10 points
test <- as.data.frame(read.csv('hp_test.csv'))
# Confirm LotArea has no NAs in the test set before predicting
unique(is.na(test$LotArea))
## [1] FALSE
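The same check can be extended to every predictor the model below uses, since predict() can return NA for test rows with missing inputs (a sketch):
colSums(is.na(test[, c('LotArea', 'YearBuilt', 'HouseStyle', 'OverallCond', 'Neighborhood')]))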
mrm <- lm(SalePrice ~ LotArea + YearBuilt + HouseStyle + OverallCond + Neighborhood, data = train )
mrm
##
## Call:
## lm(formula = SalePrice ~ LotArea + YearBuilt + HouseStyle + OverallCond +
## Neighborhood, data = train)
##
## Coefficients:
## (Intercept) LotArea YearBuilt
## -1.997e+06 1.566e+00 1.077e+03
## HouseStyle1.5Unf HouseStyle1Story HouseStyle2.5Fin
## -2.470e+04 -1.595e+04 8.696e+04
## HouseStyle2.5Unf HouseStyle2Story HouseStyleSFoyer
## 3.883e+04 6.567e+03 -2.754e+04
## HouseStyleSLvl OverallCond NeighborhoodBlueste
## -1.218e+04 8.641e+03 -5.858e+04
## NeighborhoodBrDale NeighborhoodBrkSide NeighborhoodClearCr
## -7.778e+04 -1.584e+04 1.871e+02
## NeighborhoodCollgCr NeighborhoodCrawfor NeighborhoodEdwards
## -8.605e+03 4.287e+04 -3.439e+04
## NeighborhoodGilbert NeighborhoodIDOTRR NeighborhoodMeadowV
## -2.572e+04 -3.452e+04 -7.076e+04
## NeighborhoodMitchel NeighborhoodNAmes NeighborhoodNoRidge
## -3.125e+04 -2.093e+04 1.132e+05
## NeighborhoodNPkVill NeighborhoodNridgHt NeighborhoodNWAmes
## -3.828e+04 1.016e+05 -4.682e+03
## NeighborhoodOldTown NeighborhoodSawyer NeighborhoodSawyerW
## -1.448e+04 -3.330e+04 -1.103e+04
## NeighborhoodSomerst NeighborhoodStoneBr NeighborhoodSWISU
## 1.069e+04 1.052e+05 -1.192e+04
## NeighborhoodTimber NeighborhoodVeenker
## 2.128e+04 3.569e+04
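The coefficients alone omit the fit statistics the prompt asks for; summary(mrm) adds standard errors, t-values, R-squared, and the overall F-test (output not reproduced here):
summary(mrm)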
# Avoid shadowing stats::predict with the results vector
preds <- predict(mrm, test)
predict_csv <- data.frame(Id = test$Id, SalePrice = preds)
head(predict_csv)
## Id SalePrice
## 1 1461 147826.1
## 2 1462 148737.3
## 3 1463 199128.1
## 4 1464 202814.8
## 5 1465 288344.6
## 6 1466 188823.9
write.csv(predict_csv, 'Submission.csv',row.names = FALSE)