Libraries

suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(caret)))
suppressWarnings(suppressMessages(library(tables)))
suppressWarnings(suppressMessages(library(reshape2)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(MASS)))

Reading Data

test_data <- read.csv("test.csv")
train_data <- read.csv("train.csv")
#head(test_data)
#head(train_data)

Picking the independent and dependent variable

# The quantitative independent variable picked up here is 'TotalBsmtSF'( First Floor square feet)
X <- train_data$TotalBsmtSF

# Dependent variable selected is sale price for house.
Y <- train_data$SalePrice

# Ploting the independent variable to check the variable is right skewed or left skewed.

par(oma=c(3,3,0,0),mar=c(3,3,2,2),mfrow=c(2,1))
hist(X,breaks=100, main='Basement square feet')

summary(train_data$TotalBsmtSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   795.8   991.5  1057.0  1298.0  6110.0

The graph shows that variable is skewed towards right with mean greater than the median.

Probability

Calculate the probabilities:

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
  1. P(X > x | Y > y)
  2. P(X > x | Y > y)
  3. P(X < x | Y > y)

Conditional Probability :

p(x|y) = p(x,y)/p(y)

x <- quantile(train_data$TotalBsmtSF,0.75)
x
##     75% 
## 1298.25
y <- quantile(train_data$SalePrice,0.50)
y
##    50% 
## 163000

“x” is 3rd quantile of X variable. “y” is 2nd quantile of Y variable.

  1. P(X > x| Y > y)
(filter(train_data,SalePrice > y & TotalBsmtSF > x)  %>%  tally()/nrow(train_data))/(filter(train_data, SalePrice > y) %>% tally()/nrow(train_data))
##           n
## 1 0.4519231
  1. P(X > x|Y > y)
(filter(train_data, SalePrice > x & TotalBsmtSF > x) %>% tally()/nrow(train_data)) *(filter(train_data, SalePrice > y) %>% tally()/nrow(train_data))
##           n
## 1 0.1246575
  1. P(X < x| Y > y)
(filter(train_data, SalePrice > y & TotalBsmtSF < x) %>% tally()/nrow(train_data))/
(filter(train_data, SalePrice > y) %>% tally()/nrow(train_data))
##           n
## 1 0.5480769

Does splitting the training data in this fashion make them independent?

Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

P(A) = 365/1460 = 0.25 P(B) = 728/1460 = 0.4986

P(A)P(B) = 0.25 * 0.4986 = 0.12465

We know that P(A|B) = 0.451923;

P(A|B)! = P(A)P(B)

hence X ad Y are not independent.Spliting of training data in this fasion does not make them independent. It is infered from the data that Basement area can affect the sales price.

Chi Square test

chi-squared test (χ2) is a statistical test applied to sets of categorical data to evaluate how likely it is that any observed difference between the sets arose by chance.

Chisq <- chisq.test((table(train_data$TotalBsmtSF,train_data$SalePrice)))
## Warning in chisq.test((table(train_data$TotalBsmtSF, train_data
## $SalePrice))): Chi-squared approximation may be incorrect
Chisq
## 
##  Pearson's Chi-squared test
## 
## data:  (table(train_data$TotalBsmtSF, train_data$SalePrice))
## X-squared = 509710, df = 476640, p-value < 2.2e-16

Aswe notice here, p-value is significantly less than 0.05,we surely reject NULL hypothesis.Conclusion drawn - X is not independent of Y.

Descriptive and Inference Statistics:

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

#summary(train_data)
#str(train_data)
train_data$MSZoning <- as.factor(train_data$MSZoning)
plot(train_data[,c(20:30)])

plot(train_data$MSZoning,train_data$SalePrice)

barplot(train_data$GrLivArea,train_data$SalePrice)

plot(train_data$YearBuilt,train_data$SalePrice)

Scatter Plot between X and Y
plot(train_data$TotalBsmtSF,train_data$SalePrice,main="Total Basement area and Sales Price",xlab = "Total Basement Area[X]",ylab = "Sales Price[Y]")
abline(lm(train_data$SalePrice ~ train_data$TotalBsmtSF),col = 'red')

numeric_train <- select_if(train_data,is.numeric)
names(numeric_train)
##  [1] "Id"            "MSSubClass"    "LotFrontage"   "LotArea"      
##  [5] "OverallQual"   "OverallCond"   "YearBuilt"     "YearRemodAdd" 
##  [9] "MasVnrArea"    "BsmtFinSF1"    "BsmtFinSF2"    "BsmtUnfSF"    
## [13] "TotalBsmtSF"   "X1stFlrSF"     "X2ndFlrSF"     "LowQualFinSF" 
## [17] "GrLivArea"     "BsmtFullBath"  "BsmtHalfBath"  "FullBath"     
## [21] "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr"  "TotRmsAbvGrd" 
## [25] "Fireplaces"    "GarageYrBlt"   "GarageCars"    "GarageArea"   
## [29] "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"   
## [33] "ScreenPorch"   "PoolArea"      "MiscVal"       "MoSold"       
## [37] "YrSold"        "SalePrice"

Correlation Matrix

Three variables

variables = c("GrLivArea","GarageArea","SalePrice")
correlation = cor(numeric_train[variables])
correlation
##            GrLivArea GarageArea SalePrice
## GrLivArea  1.0000000  0.4689975 0.7086245
## GarageArea 0.4689975  1.0000000 0.6234314
## SalePrice  0.7086245  0.6234314 1.0000000
library(corrplot)
## corrplot 0.84 loaded
corrplot(correlation,method = "ellipse")

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

To test the correlations we will use correlation t-test.

  1. Correlation between GrLivArea and SalePrice
cor.test(numeric_train$GrLivArea,numeric_train$SalePrice,method = "pearson",conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_train$GrLivArea and numeric_train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.6850407 0.7307245
## sample estimates:
##       cor 
## 0.7086245

As statstic shows ,The correlation between Sale Price and Above grade (ground) living area square feet is not 0, therefore we reject null hypothesis.The correlation coeficient is 0.70. The 92 % confidence interval is between 68 % to 73%.

  1. Correlation between Garage area and Sales Price
cor.test(numeric_train$GarageArea,numeric_train$SalePrice,method = "pearson",conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_train$GarageArea and numeric_train$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5945883 0.6506720
## sample estimates:
##       cor 
## 0.6234314

The correlation between Sale Price and Garage area is not 0, therefore we reject null hypothesis.The correlation coeficient is 0.62. The 92 % confidence interval is between 59 % to 65%.

  1. Correlation between Garage area and GrLivArea
cor.test(numeric_train$GarageArea,numeric_train$GrLivArea,method = "pearson",conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  numeric_train$GarageArea and numeric_train$GrLivArea
## t = 20.276, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.4324608 0.5039965
## sample estimates:
##       cor 
## 0.4689975

Th correlation coefficent in above case is lower as compare to other two but we can still reject null hypothesis.

Family wise error:

family-wise error rate (FWER) is the probability of making one or more false discoveries, or type I errors when performing multiple hypotheses tests. In our case study there is much possibilty that variables are dependent on each other and hence will give “false positive” finding.

Linear Algebra and Correlation :

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

#Inverting the correlation matrix:
Inv_cor <- solve(correlation)
Inv_cor
##              GrLivArea  GarageArea  SalePrice
## GrLivArea   2.01353305 -0.08964955 -1.3709485
## GarageArea -0.08964955  1.63976057 -0.9587504
## SalePrice  -1.37094845 -0.95875043  2.5692028
# Multiply the precision matrix by correlation matrix
Inv_cor %*% correlation
##            GrLivArea    GarageArea SalePrice
## GrLivArea          1  0.000000e+00         0
## GarageArea         0  1.000000e+00         0
## SalePrice          0 -2.220446e-16         1
# Multiply the correlation matrix by precision matrix
correlation %*% Inv_cor
##                GrLivArea GarageArea     SalePrice
## GrLivArea   1.000000e+00          0  0.000000e+00
## GarageArea -1.110223e-16          1 -2.220446e-16
## SalePrice  -2.220446e-16          0  1.000000e+00

Lower Upper Decomposition

LU(correlation %*% Inv_cor)
## $lower_triangular_matrix
##      GrLivArea GarageArea     SalePrice
## [1,]         1          0  0.000000e+00
## [2,]         0          1 -2.220446e-16
## [3,]         0          0  1.000000e+00
LU(Inv_cor %*% correlation )
## $lower_triangular_matrix
##      GrLivArea GarageArea SalePrice
## [1,]         1          0         0
## [2,]         0          1         0
## [3,]         0          0         1

Calculus-Based Probability & Statistics.

TotalBsmtSF <- as.numeric(train_data$TotalBsmtSF)
min(train_data$TotalBsmtSF)
## [1] 0

Derive the exponential distribution

fit <- fitdistr(TotalBsmtSF,"exponential")
lambda <- fit$estimate
lambda
##         rate 
## 0.0009456896

create the sample of 1000

sample <- rexp(1000,lambda)
hist(sample, freq = FALSE, breaks = 100, main ="sample vs observed", xlim = c(1, quantile(sample, 0.99)))
curve(dexp(x, rate = lambda), col = "red", add = TRUE)

hist(X,freq = FALSE, breaks = 100, main = "Observed Plot for Total basement square foot")

Using exponential pdf,find 5th and 95th percentiles

# Simulated Percentile
perc5 <- log(1 - 0.05)/-lambda
perc5
##     rate 
## 54.23904
perc95 <- log(1 - 0.95)/-lambda
perc95
##     rate 
## 3167.776
# Observed Percentile 
Operc5 <- quantile(train_data$TotalBsmtSF,0.05)
Operc5
##    5% 
## 519.3
Operc95 <- quantile(train_data$TotalBsmtSF,0.95)
Operc95
##  95% 
## 1753

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Data wrangling :

Multivariable regression model
model <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces  + GarageCars + WoodDeckSF + OpenPorchSF + EnclosedPorch , data = numeric_train)

summary(model)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + 
##     GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + BedroomAbvGr + 
##     KitchenAbvGr + TotRmsAbvGrd + Fireplaces + GarageCars + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch, data = numeric_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -347438  -19593   -2575   15973  360166 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.279e+06  1.644e+05  -7.778 1.61e-14 ***
## MSSubClass    -1.525e+02  3.360e+01  -4.538 6.26e-06 ***
## LotFrontage   -5.358e+01  5.770e+01  -0.929 0.353262    
## LotArea        7.629e-01  1.574e-01   4.846 1.43e-06 ***
## OverallQual    1.869e+04  1.424e+03  13.124  < 2e-16 ***
## OverallCond    4.211e+03  1.273e+03   3.308 0.000969 ***
## YearBuilt      2.228e+02  6.864e+01   3.247 0.001201 ** 
## YearRemodAdd   1.888e+02  7.852e+01   2.404 0.016379 *  
## MasVnrArea     4.779e+01  6.843e+00   6.985 4.78e-12 ***
## BsmtFinSF1     1.784e+03  5.177e+02   3.446 0.000588 ***
## BsmtUnfSF     -5.559e+02  8.822e+02  -0.630 0.528707    
## TotalBsmtSF    4.795e+02  1.411e+03   0.340 0.733984    
## X1stFlrSF      1.817e+04  1.260e+04   1.442 0.149663    
## X2ndFlrSF      5.485e+01  1.235e+03   0.044 0.964571    
## GrLivArea      4.499e+04  1.554e+04   2.896 0.003851 ** 
## BsmtFullBath   9.335e+03  2.945e+03   3.170 0.001564 ** 
## BsmtHalfBath   1.395e+03  4.914e+03   0.284 0.776538    
## FullBath       9.634e+03  3.091e+03   3.116 0.001876 ** 
## BedroomAbvGr  -1.104e+04  2.016e+03  -5.476 5.32e-08 ***
## KitchenAbvGr  -1.877e+04  6.041e+03  -3.107 0.001936 ** 
## TotRmsAbvGrd   8.126e+03  1.425e+03   5.704 1.48e-08 ***
## Fireplaces     5.304e+03  2.138e+03   2.481 0.013250 *  
## GarageCars     1.110e+04  1.965e+03   5.648 2.03e-08 ***
## WoodDeckSF     6.356e+02  4.690e+02   1.355 0.175554    
## OpenPorchSF    2.440e+01  1.800e+01   1.356 0.175414    
## EnclosedPorch  3.844e+02  7.224e+02   0.532 0.594780    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37490 on 1169 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.801,  Adjusted R-squared:  0.7968 
## F-statistic: 188.3 on 25 and 1169 DF,  p-value: < 2.2e-16

Predicting on test data

test_data$SalePrice <- predict(model,test_data,type = 'response')
Kaggle submission

Username : jagso

Score : 4.92801

headers = c("Id","SalePrice")
Kaggle_sub  = test_data[headers]
Kaggle_sub$SalePrice[is.na(Kaggle_sub$SalePrice)] <- 0
Kaggle_sub$SalePrice <- as.numeric(Kaggle_sub$SalePrice)
write.csv(Kaggle_sub,"Kaggle_sub.csv",row.names = FALSE)
Alt text

Alt text