library(knitr)
library(pracma)
library(kableExtra)
library(tidyr)
library(ggplot2)
library(data.table)
library(DT)
library(psych)
library(scales)
library(corrplot)
library(dplyr)
library(stringr)
library(matrixcalc)
library(MASS)

Problem 1

1.1 Random variables

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of (N+1)/2.

set.seed(1025)

# Choosing N at random from [6, 100]
N <- round(runif(1, 6, 100))
n <- 10000

# Generating variable X: 10,000 uniform draws from 1 to N
X <- runif(n, min = 1, max = N)
hist(X)

# Generating variable Y: 10,000 normal draws with mean and sd of (N+1)/2
Y <- rnorm(n, (N + 1) / 2, (N + 1) / 2)
hist(Y)
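As a quick sanity check (an addition, not part of the original output), the empirical means of both variables should land near the common theoretical value (N + 1)/2:

# Sanity check (sketch): empirical vs. theoretical means
c(theoretical = (N + 1) / 2, mean_X = mean(X), mean_Y = mean(Y))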

1.2 Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

x <- median(X)
round(x, 3)
## [1] 28.409
y <- quantile(Y, 0.25)[[1]]
round(y, 3)
## [1] 9.626
# a. P(X > x | X > y): probability that X exceeds its median,
#    given that X exceeds the 1st quartile of Y
P1 <- sum(X > x & X > y)
P2 <- sum(X > y)

print(P1/P2)
## [1] 0.6003842
# b. P(X > x and Y > y): joint probability
P3 <- sum(X > x & Y > y)/n
print(P3)
## [1] 0.3746
# c. P(X < x and X > y): probability that X falls between y and its median
P4 <- sum(X < x & X > y)/n
print(P4)
## [1] 0.3328

Interpretation: given that X exceeds the 1st quartile of Y, there is about a 60% chance that X also exceeds its own median; X exceeds its median while Y exceeds its 1st quartile about 37.5% of the time (close to 0.5 × 0.75, as expected for independently generated variables); and X falls between y and its own median about 33% of the time.

1.3 Marginal and joint probabilities

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Building a 2x2 table of counts, with marginal totals appended
counts <- matrix(c(sum(X>x & Y<y), sum(X>x & Y>y), sum(X<x & Y<y), sum(X<x & Y>y)), nrow = 2, ncol = 2)
counts <- cbind(counts, rowSums(counts))
counts <- rbind(counts, colSums(counts))
table <- as.data.frame(counts)

# Editing row and column names
names(table) <- c("X > x", "X < x", "Overall")
row.names(table) <- c("Y < y", "Y > y", "Overall")
head(table) %>% kable() %>% kable_styling()
          X > x   X < x   Overall
Y < y      1254    1246      2500
Y > y      3746    3754      7500
Overall    5000    5000     10000
# Computing probabilities from the above table of counts

matrix_probability <- counts / counts[3,3]
matrix_probability <- as.data.frame(matrix_probability)
names(matrix_probability) <- c("X > x", "X < x", "Overall")
row.names(matrix_probability ) <- c("Y < y", "Y > y", "Overall")
head(matrix_probability) %>% kable() %>% kable_styling()
          X > x    X < x   Overall
Y < y    0.1254   0.1246      0.25
Y > y    0.3746   0.3754      0.75
Overall  0.5000   0.5000      1.00
# Now let's check whether both probabilities are equal

# P(X > x) * P(Y > y)
prob1 <- matrix_probability[3,1] * matrix_probability[2,3]
prob1
## [1] 0.375
# P(X > x and Y > y), rounded to three decimals
prob2 <- round(matrix_probability[2,1], 3)
prob2
## [1] 0.375
# Checking whether P(X>x and Y>y) = P(X>x)P(Y>y)
prob1 == prob2
## [1] TRUE

Hence, up to rounding, P(X>x and Y>y) = 0.3746 ≈ 0.375 = P(X>x)P(Y>y): the joint probability matches the product of the marginals, which is exactly what we would expect since X and Y were generated independently.

1.4 Fisher test and chi-sq for independence

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

H0: The events {X > x} and {Y > y} are independent. Ha: The events are dependent.

# Fisher's Exact test
fisher <- fisher.test(table, simulate.p.value = TRUE)
fisher
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based
##  on 2000 replicates)
## 
## data:  table
## p-value = 1
## alternative hypothesis: two.sided
# Chi-square test
chisq <- chisq.test(table)
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  table
## X-squared = 0.034133, df = 4, p-value = 0.9999

The p-values for the Fisher test and the chi-square test are 1 and 0.9999, far above any reasonable significance level, so we fail to reject the null hypothesis: both tests are consistent with the events being independent, matching the result of the previous question. As for the difference between the two: Fisher's Exact Test computes an exact p-value and is typically preferred for small samples or sparse tables, while the chi-square test relies on a large-sample approximation. With 10,000 observations the chi-square test is entirely appropriate here, and the two tests agree.
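One caveat: `table` above still contains the marginal row and column, which is why the chi-square output reports df = 4 rather than the df = 1 expected for a 2x2 table. Strictly, both tests should be run on the inner 2x2 counts only; a corrected sketch:

# Running both tests on the 2x2 counts only, excluding the margins (sketch)
inner <- as.matrix(table[1:2, 1:2])
fisher.test(inner)
chisq.test(inner)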

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

2.1 Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Reading the dataset
train <- read.csv("C:/Users/hukha/Desktop/MS - Data Science/Data 605 - Computational Mathematics/FinalExam-dataset/train.csv", header = TRUE)

Descriptive summary

# Descriptive statistics of TRAIN dataset
summary(train) 
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   None   :864   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   Stone  :128   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   NA's   :  8   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation  BsmtQual   BsmtCond    BsmtExposure
##  Ex: 52    Ex:   3   BrkTil:146   Ex  :121   Fa  :  45   Av  :221    
##  Fa: 14    Fa:  28   CBlock:634   Fa  : 35   Gd  :  65   Gd  :134    
##  Gd:488    Gd: 146   PConc :647   Gd  :618   Po  :   2   Mn  :114    
##  TA:906    Po:   1   Slab  : 24   TA  :649   TA  :1311   No  :953    
##            TA:1282   Stone :  6   NA's: 37   NA's:  37   NA's: 38    
##                      Wood  :  3                                      
##                                                                      
##  BsmtFinType1   BsmtFinSF1     BsmtFinType2   BsmtFinSF2     
##  ALQ :220     Min.   :   0.0   ALQ :  19    Min.   :   0.00  
##  BLQ :148     1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00  
##  GLQ :418     Median : 383.5   GLQ :  14    Median :   0.00  
##  LwQ : 74     Mean   : 443.6   LwQ :  46    Mean   :  46.55  
##  Rec :133     3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00  
##  Unf :430     Max.   :5644.0   Unf :1256    Max.   :1474.00  
##  NA's: 37                      NA's:  38                     
##    BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC CentralAir
##  Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741    N:  95    
##  1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365    
##  Median : 477.5   Median : 991.5   GasW :  18   Gd:241              
##  Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1              
##  3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428              
##  Max.   :2336.0   Max.   :6110.0   Wall :   4                       
##                                                                     
##  Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##  Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                              
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100     
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39     
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586     
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000              
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000              
##                                                              
##   TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu   GarageType 
##  Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6  
##  1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870  
##  Median : 6.000   Min1:  31   Median :1.000   Gd  :380    Basment: 19  
##  Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88  
##  3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9  
##  Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690    Detchd :387  
##                   Typ :1360                               NA's   : 81  
##   GarageYrBlt   GarageFinish   GarageCars      GarageArea     GarageQual 
##  Min.   :1900   Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3  
##  1st Qu.:1961   RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48  
##  Median :1980   Unf :605     Median :2.000   Median : 480.0   Gd  :  14  
##  Mean   :1979   NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3  
##  3rd Qu.:2002                3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311  
##  Max.   :2010                Max.   :4.000   Max.   :1418.0   NA's:  81  
##  NA's   :81                                                              
##  GarageCond  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex  :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa  :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd  :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  Po  :   7              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  TA  :1326              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  NA's:  81              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                           
##    X3SsnPorch      ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex  :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd  :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   NA's:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.00   Max.   :480.00   Max.   :738.000              
##                                                                 
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv:  59   Gar2:   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo :  54   Othr:   2   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv: 157   Shed:  49   Median :    0.00   Median : 6.000  
##  MnWw :  11   TenC:   1   Mean   :   43.49   Mean   : 6.322  
##  NA's :1179   NA's:1406   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :15500.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
describe(train) %>% DT::datatable()

Scatterplot matrix

# Checking Neighborhood's relationship with SalePrice
ggplot(train) + geom_boxplot(aes(x=reorder(Neighborhood, SalePrice),y=SalePrice, fill=Neighborhood))+theme(axis.text.x = element_text(angle=60))+scale_y_continuous(label=dollar)

# Checking CentralAir's relationship with SalePrice
ggplot(train) + geom_boxplot(aes(x=reorder(CentralAir, SalePrice),y=SalePrice, fill=CentralAir))+theme(axis.text.x = element_text(angle=60))+scale_y_continuous(label=dollar)

# LotArea vs SalePrice, with a fitted regression line
ggplot(train, aes(x=LotArea, y=SalePrice)) + geom_point()+geom_smooth(method="lm")+ scale_y_continuous(label=comma)+xlim(0,40000)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 14 rows containing non-finite values (stat_smooth).
## Warning: Removed 14 rows containing missing values (geom_point).

According to the visualizations, house prices vary considerably by neighborhood, which makes sense. Likewise, houses with central air command noticeably higher prices than those without. LotArea also appears to have a positive relationship with SalePrice, though clearly other factors influence the sale price as well.
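The assignment also asks for a scatterplot matrix of at least two independent variables and the dependent variable; a minimal sketch using base R's pairs() (variable choice here is illustrative):

# Scatterplot matrix (sketch): two predictors plus the response
pairs(train[, c("LotArea", "GrLivArea", "SalePrice")],
      main = "Scatterplot Matrix")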

Correlation matrix

Now let's create a correlation matrix for four quantitative independent variables together with SalePrice.

# Selecting quantitative variables for the correlation matrix
train_cor <- train %>% dplyr::select(LotArea, OverallCond, OverallQual, TotalBsmtSF, SalePrice)

train_m <- cor(train_cor, method="pearson", use="complete.obs")
train_m
##                 LotArea OverallCond OverallQual TotalBsmtSF   SalePrice
## LotArea      1.00000000 -0.00563627  0.10580574   0.2608331  0.26384335
## OverallCond -0.00563627  1.00000000 -0.09193234  -0.1710975 -0.07785589
## OverallQual  0.10580574 -0.09193234  1.00000000   0.5378085  0.79098160
## TotalBsmtSF  0.26083313 -0.17109751  0.53780850   1.0000000  0.61358055
## SalePrice    0.26384335 -0.07785589  0.79098160   0.6135806  1.00000000
# Creating correlation plot
corrplot(train_m, type="upper", order="hclust", tl.col="black", tl.srt=45, addCoef.col = "white")

The correlation matrix shows that every selected variable is correlated with SalePrice to some degree. OverallCond has only a weak, negative correlation with SalePrice, while LotArea, TotalBsmtSF, and OverallQual are positively correlated with it, OverallQual strongly so (0.79).

Hypothesis testing

OverallCond and SalePrice

# Checking relationship between overallCond and SalePrice
cor.test(train_cor$SalePrice, train_cor$OverallCond, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_cor$SalePrice and train_cor$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.1111272 -0.0444103
## sample estimates:
##         cor 
## -0.07785589

The correlation between OverallCond and SalePrice is weak (r = -0.078), but the p-value of 0.002912 indicates that this correlation is statistically significantly different from zero.

LotArea and SalePrice

cor.test(train_cor$SalePrice, train_cor$LotArea, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_cor$SalePrice and train_cor$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434

There is a statistically significant correlation between LotArea and SalePrice, but at roughly 0.26 the relationship between the two is weak.

TotalBsmtSF and SalePrice

cor.test(train_cor$SalePrice, train_cor$TotalBsmtSF, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_cor$SalePrice and train_cor$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806

TotalBsmtSF has a significant, moderately strong positive relationship with SalePrice (r = 0.61). It makes sense that as basement square footage increases, the house price goes up as well.

OverallQual and SalePrice

cor.test(train_cor$SalePrice, train_cor$OverallQual, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_cor$SalePrice and train_cor$OverallQual
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780752 0.8032204
## sample estimates:
##       cor 
## 0.7909816

Overall quality also has a strong positive relationship with SalePrice, with a correlation of 0.79.

Family-wise error

Familywise error refers to the probability of making one or more false discoveries (Type I errors) when performing multiple hypothesis tests. With only four comparisons here, and with three of the four p-values below 2.2e-16, even a conservative multiple-testing correction would leave every conclusion unchanged, so I would not be worried about familywise error in this analysis.
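To make that concrete, a quick sketch applying a Bonferroni adjustment to the four p-values reported above (the values listed as < 2.2e-16 are upper bounds):

# Bonferroni adjustment of the four correlation-test p-values (sketch)
pvals <- c(0.002912, 2.2e-16, 2.2e-16, 2.2e-16)
p.adjust(pvals, method = "bonferroni")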

2.2 Linear algebra and correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Inverting the correlation matrix

# Existing correlation matrix
train_m
##                 LotArea OverallCond OverallQual TotalBsmtSF   SalePrice
## LotArea      1.00000000 -0.00563627  0.10580574   0.2608331  0.26384335
## OverallCond -0.00563627  1.00000000 -0.09193234  -0.1710975 -0.07785589
## OverallQual  0.10580574 -0.09193234  1.00000000   0.5378085  0.79098160
## TotalBsmtSF  0.26083313 -0.17109751  0.53780850   1.0000000  0.61358055
## SalePrice    0.26384335 -0.07785589  0.79098160   0.6135806  1.00000000
# Inverting correlation matrix
train_inverted <- inv(train_m)
train_inverted
##                 LotArea OverallCond OverallQual TotalBsmtSF  SalePrice
## LotArea      1.13583178 -0.03339221  0.33967628  -0.2154341 -0.4387729
## OverallCond -0.03339221  1.03354091  0.04575494   0.2058735 -0.0732337
## OverallQual  0.33967628  0.04575494  2.80779485  -0.2811661 -2.1344551
## TotalBsmtSF -0.21543414  0.20587345 -0.28116610   1.7023710 -0.7492752
## SalePrice   -0.43877294 -0.07323370 -2.13445505  -0.7492752  3.2581210
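As the prompt notes, the diagonal of this precision matrix contains the variance inflation factor for each variable; they can be read off directly:

# Variance inflation factors: the diagonal of the precision matrix
diag(train_inverted)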

Precision and correlation matrices

# Multiplying the correlation matrix by the precision matrix
p1 <- train_m %*% train_inverted
round(p1)
##             LotArea OverallCond OverallQual TotalBsmtSF SalePrice
## LotArea           1           0           0           0         0
## OverallCond       0           1           0           0         0
## OverallQual       0           0           1           0         0
## TotalBsmtSF       0           0           0           1         0
## SalePrice         0           0           0           0         1
# Multiplying the precision matrix by the correlation matrix
p2 <- train_inverted %*% train_m
round(p2)
##             LotArea OverallCond OverallQual TotalBsmtSF SalePrice
## LotArea           1           0           0           0         0
## OverallCond       0           1           0           0         0
## OverallQual       0           0           1           0         0
## TotalBsmtSF       0           0           0           1         0
## SalePrice         0           0           0           0         1

Both products yield the identity matrix (1s on the diagonal, 0s elsewhere), confirming that train_inverted is indeed the inverse of the correlation matrix.

LU Decomposition

train_lu <- lu.decomposition(train_m)
train_lu 
## $L
##             [,1]        [,2]      [,3]      [,4] [,5]
## [1,]  1.00000000  0.00000000 0.0000000 0.0000000    0
## [2,] -0.00563627  1.00000000 0.0000000 0.0000000    0
## [3,]  0.10580574 -0.09133889 1.0000000 0.0000000    0
## [4,]  0.26083313 -0.16963278 0.5045754 1.0000000    0
## [5,]  0.26384335 -0.07637123 0.7711564 0.2299716    1
## 
## $U
##      [,1]        [,2]        [,3]       [,4]       [,5]
## [1,]    1 -0.00563627  0.10580574  0.2608331  0.2638434
## [2,]    0  0.99996823 -0.09133599 -0.1696274 -0.0763688
## [3,]    0  0.00000000  0.98046262  0.4947173  0.7560900
## [4,]    0  0.00000000  0.00000000  0.6535696  0.1503024
## [5,]    0  0.00000000  0.00000000  0.0000000  0.3069254
# Checking if decomposition results in original correlation matrix
train_lu2 <- train_lu$L %*% train_lu$U
round(train_lu2,4) == round(train_m, 4)
##             LotArea OverallCond OverallQual TotalBsmtSF SalePrice
## LotArea        TRUE        TRUE        TRUE        TRUE      TRUE
## OverallCond    TRUE        TRUE        TRUE        TRUE      TRUE
## OverallQual    TRUE        TRUE        TRUE        TRUE      TRUE
## TotalBsmtSF    TRUE        TRUE        TRUE        TRUE      TRUE
## SalePrice      TRUE        TRUE        TRUE        TRUE      TRUE

2.3 Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Select a variable and shift

# SalePrice is skewed to the right
hist(train$SalePrice, main="House Sales Price", col="red")

# Transforming the variable: standardizing SalePrice and taking absolute
# values so that all values are above zero
hist_t <- abs(scale(train$SalePrice))
hist(hist_t, main="Transformed Sales Price", col="blue")

Fit Exponential Probability Density Function

train_fit <- fitdistr(hist_t, densfun="exponential")
train_fitest <- train_fit$estimate
train_fitest
##     rate 
## 1.383178
# Take sample of 1000
hist(rexp(1000, train_fit$estimate), breaks=100, main="Fitted Exponential Probability Density Function")

# Comparing it with original variable i.e. SalePrice
hist(train$SalePrice, breaks=100, main="Sales Price")

After fitting the exponential probability density function, the estimated rate parameter λ is 1.383178. Compared with the original SalePrice histogram, the fitted exponential decays monotonically from zero, so it reproduces the long right tail but not the peak well above the minimum.
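Since abs(scale()) put the variable on a standardized |z| scale, the fitted rate is not in dollars. The minimum SalePrice is $34,900, already above zero, so the exponential could also be fit directly on the dollar scale without any shift (a sketch; for an exponential, fitdistr's rate estimate is simply 1/mean):

# Alternative (sketch): fit the exponential on the original dollar scale
fit_dollar <- fitdistr(train$SalePrice, densfun = "exponential")
fit_dollar$estimate  # rate = 1 / mean(train$SalePrice)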

# 5th and 95th percentile using cumulative distribution function
paste0("The 5th percentile is ", qexp(0.05, rate= train_fitest, log.p = FALSE, lower.tail = TRUE))
## [1] "The 5th percentile is 0.0370836576509109"
paste0("The 95 percentile is ", qexp(0.95, rate=train_fitest, log.p=FALSE, lower.tail = TRUE))
## [1] "The 95 percentile is 2.16583300746665"
# 95th percentile of SalePrice under a normality assumption, for comparison
sp_mean <- mean(train$SalePrice)
sp_sd <- sd(train$SalePrice)
qnorm(0.95, sp_mean, sp_sd)
## [1] 311592.5
# Empirical 5th and 95th percentiles of SalePrice
quantile(train$SalePrice, c(0.05,0.95))
##     5%    95% 
##  88000 326100
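The prompt also asks for a 95% confidence interval from the empirical data, assuming normality. A minimal sketch for the mean of SalePrice (n_obs and margin are names introduced here):

# 95% confidence interval for the mean SalePrice, assuming normality (sketch)
n_obs <- length(train$SalePrice)
margin <- qnorm(0.975) * sp_sd / sqrt(n_obs)
c(lower = sp_mean - margin, upper = sp_mean + margin)

Note that the exponential percentiles (0.037 and 2.166) are on the transformed |z| scale, so they are not directly comparable to the empirical dollar percentiles of $88,000 and $326,100. The main takeaway is that the normal assumption puts the 95th percentile at $311,593, somewhat below the empirical $326,100, reflecting the right skew of SalePrice.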

2.4 Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

# Selecting only variables that are numeric
train_variables <- select_if(train, is.numeric)

# Creating a model
train_model <- lm(data=train_variables, SalePrice ~ .)
summary(train_model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train_variables)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442182  -16955   -2824   15125  318183 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.351e+05  1.701e+06  -0.197 0.843909    
## Id            -1.205e+00  2.658e+00  -0.453 0.650332    
## MSSubClass    -2.001e+02  3.451e+01  -5.797 8.84e-09 ***
## LotFrontage   -1.160e+02  6.126e+01  -1.894 0.058503 .  
## LotArea        5.422e-01  1.575e-01   3.442 0.000599 ***
## OverallQual    1.866e+04  1.482e+03  12.592  < 2e-16 ***
## OverallCond    5.239e+03  1.368e+03   3.830 0.000135 ***
## YearBuilt      3.164e+02  8.766e+01   3.610 0.000321 ***
## YearRemodAdd   1.194e+02  8.668e+01   1.378 0.168607    
## MasVnrArea     3.141e+01  7.022e+00   4.473 8.54e-06 ***
## BsmtFinSF1     1.736e+01  5.838e+00   2.973 0.003014 ** 
## BsmtFinSF2     8.342e+00  8.766e+00   0.952 0.341532    
## BsmtUnfSF      5.005e+00  5.277e+00   0.948 0.343173    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.597e+01  7.360e+00   6.246 6.02e-10 ***
## X2ndFlrSF      4.663e+01  6.102e+00   7.641 4.72e-14 ***
## LowQualFinSF   3.341e+01  2.794e+01   1.196 0.232009    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.043e+03  3.198e+03   2.828 0.004776 ** 
## BsmtHalfBath   2.465e+03  5.073e+03   0.486 0.627135    
## FullBath       5.433e+03  3.531e+03   1.539 0.124182    
## HalfBath      -1.098e+03  3.321e+03  -0.331 0.740945    
## BedroomAbvGr  -1.022e+04  2.155e+03  -4.742 2.40e-06 ***
## KitchenAbvGr  -2.202e+04  6.710e+03  -3.282 0.001063 ** 
## TotRmsAbvGrd   5.464e+03  1.487e+03   3.674 0.000251 ***
## Fireplaces     4.372e+03  2.189e+03   1.998 0.046020 *  
## GarageYrBlt   -4.728e+01  9.106e+01  -0.519 0.603742    
## GarageCars     1.685e+04  3.491e+03   4.827 1.58e-06 ***
## GarageArea     6.274e+00  1.213e+01   0.517 0.605002    
## WoodDeckSF     2.144e+01  1.002e+01   2.139 0.032662 *  
## OpenPorchSF   -2.252e+00  1.949e+01  -0.116 0.907998    
## EnclosedPorch  7.295e+00  2.062e+01   0.354 0.723590    
## X3SsnPorch     3.349e+01  3.758e+01   0.891 0.373163    
## ScreenPorch    5.805e+01  2.041e+01   2.844 0.004532 ** 
## PoolArea      -6.052e+01  2.990e+01  -2.024 0.043204 *  
## MiscVal       -3.761e+00  6.960e+00  -0.540 0.589016    
## MoSold        -2.217e+02  4.229e+02  -0.524 0.600188    
## YrSold        -2.474e+02  8.458e+02  -0.293 0.769917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36800 on 1085 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8096, Adjusted R-squared:  0.8034 
## F-statistic: 131.8 on 35 and 1085 DF,  p-value: < 2.2e-16

Although the adjusted R-squared is 0.8034, the model contains many insignificant variables that should be excluded. The final model, after eliminating those variables, is below; its adjusted R-squared of 0.7703 indicates the reduced model still explains about 77% of the variance in SalePrice.

# Modifying the model after eliminating insignificant variables
train_model2 <- lm(data=train_variables, SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFullBath + TotRmsAbvGrd + Fireplaces + WoodDeckSF +ScreenPorch)

summary(train_model2)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFullBath + 
##     TotRmsAbvGrd + Fireplaces + WoodDeckSF + ScreenPorch, data = train_variables)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -404302  -20459   -3061   15678  360505 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.299e+05  9.152e+04 -10.161  < 2e-16 ***
## MSSubClass   -1.670e+02  2.402e+01  -6.956 5.31e-12 ***
## LotArea       6.433e-01  1.077e-01   5.971 2.97e-09 ***
## OverallQual   2.731e+04  1.074e+03  25.441  < 2e-16 ***
## OverallCond   3.861e+03  9.942e+02   3.884 0.000107 ***
## YearBuilt     4.174e+02  4.651e+01   8.976  < 2e-16 ***
## MasVnrArea    4.421e+01  6.317e+00   6.998 3.97e-12 ***
## BsmtFinSF1    1.901e+01  3.084e+00   6.163 9.24e-10 ***
## BsmtFullBath  7.064e+03  2.574e+03   2.744 0.006141 ** 
## TotRmsAbvGrd  1.174e+04  7.277e+02  16.136  < 2e-16 ***
## Fireplaces    8.904e+03  1.828e+03   4.870 1.24e-06 ***
## WoodDeckSF    3.895e+01  8.562e+00   4.549 5.84e-06 ***
## ScreenPorch   6.105e+01  1.838e+01   3.321 0.000919 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38000 on 1439 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7722, Adjusted R-squared:  0.7703 
## F-statistic: 406.4 on 12 and 1439 DF,  p-value: < 2.2e-16

The F-statistic (406.4 on 12 and 1439 df, p < 2.2e-16) shows that the model is significant overall.

# Checking normality
par(mfrow=c(2,2))
plot(train_model2)

The diagnostic plots above show roughly normally distributed residuals with some outliers; despite those, the model remains statistically significant.
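For comparison, an automated alternative to the manual elimination above is stepwise selection with MASS::stepAIC. A sketch (not the model submitted): it drops Id and the two aliased totals, TotalBsmtSF and GrLivArea, and restricts to complete cases, since lm discards rows with missing values anyway:

# Stepwise AIC selection as an alternative to manual elimination (sketch)
train_complete <- na.omit(subset(train_variables,
                                 select = -c(Id, TotalBsmtSF, GrLivArea)))
step_model <- MASS::stepAIC(lm(SalePrice ~ ., data = train_complete),
                            trace = FALSE)
summary(step_model)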

# Reading the test data
test <- read.csv("C:/Users/hukha/Desktop/MS - Data Science/Data 605 - Computational Mathematics/FinalExam-dataset/test.csv", header = TRUE)

# Predicting the test data
train_pred <- predict(train_model2, test)

# Creating dataframe to save to kaggle
train_pred2 <- data.frame(Id = test[,"Id"], SalePrice = train_pred)

# Replacing missing predictions with 0 (a crude fallback; see the alternative below)
train_pred2 <- replace(train_pred2, is.na(train_pred2),0)

# Writing the csv file; row.names = FALSE keeps Kaggle's two-column submission format
write.csv(train_pred2, "kaggle_pred.csv", row.names = FALSE)
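One caveat: the competition scores on the log of the sale price, so zero predictions are heavily penalized. A gentler fallback (a sketch replacing the zero-fill step above; train_pred3 is a name introduced here) imputes missing predictions with the mean training price:

# Alternative fallback (sketch): impute missing predictions with mean SalePrice
train_pred3 <- data.frame(Id = test$Id, SalePrice = train_pred)
train_pred3$SalePrice[is.na(train_pred3$SalePrice)] <- mean(train$SalePrice)
write.csv(train_pred3, "kaggle_pred.csv", row.names = FALSE)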