libraries

library(psych)
## Warning: package 'psych' was built under R version 4.0.5
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(matrixcalc)
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5

Problem 1

# Simulation size and the upper bound N shared by both distributions.
n <- 10000
N <- 6

# Y uses (N + 1) / 2 as both its mean and its standard deviation.
y_center <- (N + 1) / 2

# Draw X ~ Uniform(1, N) first, then Y ~ Normal(y_center, y_center).
# The draw order is kept so a seeded run yields the same values as before.
x <- runif(n, min = 1, max = N)
y <- rnorm(n, mean = y_center, sd = y_center)

prob_table <- data.frame(x = x, y = y)

# Thresholds used below: the median of X and the first quartile of Y.
x_median <- median(x)
y_quartile <- quantile(y, probs = 0.25)

a) P(X>x|X>y)

# Among the draws whose x exceeds the first quartile of Y, the fraction whose
# x also exceeds the median of X.
x_above_yq <- prob_table$x > y_quartile
p_a <- sum(x_above_yq & prob_table$x > x_median) / sum(x_above_yq)

print(paste("the probability for question 1a is ", p_a))
## [1] "the probability for question 1a is  0.513980263157895"

b) P(X>x,Y>y)

# Share of all n draws with x above the median of X and y above the first
# quartile of Y.
p_b <- sum(prob_table$x > x_median & prob_table$y > y_quartile) / n

print(paste("the probability for question 1b is ", p_b))
## [1] "the probability for question 1b is  0.3772"

c) P(X<x | X>y)

# Among the draws whose x exceeds the first quartile of Y, the fraction whose
# x lies below the median of X.
cond_x_above_yq <- prob_table$x > y_quartile
p_c <- sum(prob_table$x < x_median & cond_x_above_yq) / sum(cond_x_above_yq)

print(paste("the probability for question 1c is ", p_c))
## [1] "the probability for question 1c is  0.486019736842105"

Probability tables and independence testing

# Joint probabilities of the four (X vs. median) x (Y vs. first quartile) cells.
x_high <- prob_table$x > x_median
y_high <- prob_table$y > y_quartile

# Column-major order: the first two entries fill the "y>y_quartile" column,
# the last two fill the "y<=y_quartile" column.
probs_v <- c(
  sum(x_high & y_high),
  sum(!x_high & y_high),
  sum(x_high & !y_high),
  sum(!x_high & !y_high)
) / n

prob_mat <- matrix(
  probs_v,
  nrow = 2,
  ncol = 2,
  dimnames = list(
    c("x>x_median", "x<=x_median"),
    c("y>y_quartile", "y<=y_quartile")
  )
)

print(prob_mat)
##             y>y_quartile y<=y_quartile
## x>x_median        0.3772        0.1228
## x<=x_median       0.3728        0.1272
# Informal independence check: compare the joint probability of the first cell
# (x above its median AND y above its first quartile) with the product of the
# two marginal probabilities.
joint_p <- prob_mat[1, 1]
product_p <- (sum(prob_table$x > x_median) / n) *
  (sum(prob_table$y > y_quartile) / n)

# Use an explicit tolerance instead of equality after round(): the marginal
# product sits on a two-decimal rounding boundary (0.5 * 0.75 = 0.375), so a
# rounding-based comparison depends on how ties are rounded. 0.005 is half a
# unit in the second decimal place.
print(abs(joint_p - product_p) < 0.005)
## [1] TRUE
# Raw cell counts in the same column-major order as the probability table:
# (x high, y high), (x low, y high), (x high, y low), (x low, y low).
x_high <- prob_table$x > x_median
y_high <- prob_table$y > y_quartile
count_v <- c(
  sum(x_high & y_high),
  sum(!x_high & y_high),
  sum(x_high & !y_high),
  sum(!x_high & !y_high)
)

print(count_v)
## [1] 3772 3728 1228 1272
# 2x2 contingency table of counts, built once and shared by both tests.
count_mat <- matrix(count_v, nrow = 2, ncol = 2)

# Pearson chi-squared test of independence (for 2x2 tables chisq.test applies
# the continuity correction by default).
print(chisq.test(count_mat)$p.value)
## [1] 0.3206893
# Fisher's exact test. simulate.p.value is omitted because it only applies to
# tables larger than 2x2 and is ignored here.
print(fisher.test(count_mat)$p.value)
## [1] 0.3206894

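The two p-values are nearly identical. Since neither is below .05, we fail to reject the null hypothesis of independence. The data are therefore consistent with X and Y being independent, although failing to reject the null does not by itself prove independence.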

The chi-squared test applies an approximation assuming the sample is large. The Fisher’s exact test runs an exact procedure especially for small-sized samples. In this case the chi-squared is more appropriate given the size of our sample set.

Problem 2

Descriptive and Inferential Statistics

gen stats

# Print per-column summary statistics for the training data.
# NOTE(review): train_set is not created in the visible part of this document;
# it must be loaded before this line runs — confirm where that happens.
print(summary(train_set))
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# List the column names of train_set.
print(colnames(train_set))
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

univariate stats

# Variables under study: SalePrice is treated as the dependent variable,
# LotArea and GarageArea as the two independent variables.
dep_var <- train_set[["SalePrice"]]
ind_var1 <- train_set[["LotArea"]]
ind_var2 <- train_set[["GarageArea"]]

# The three study variables in reporting order: SalePrice, LotArea, GarageArea.
study_vars <- list(dep_var, ind_var1, ind_var2)

# Base-R summaries (min, quartiles, median, mean, max) for each variable.
invisible(lapply(study_vars, function(v) print(summary(v))))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   334.5   480.0   473.0   576.0  1418.0

# psych::describe() output for each variable (sd, skew, kurtosis, se, ...).
invisible(lapply(study_vars, function(v) print(describe(v))))
##    vars    n     mean      sd median  trimmed     mad   min    max  range skew
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100 1.88
##    kurtosis      se
## X1      6.5 2079.11
##    vars    n     mean      sd median trimmed     mad  min    max  range  skew
## X1    1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945 12.18
##    kurtosis     se
## X1   202.26 261.22
##    vars    n   mean    sd median trimmed    mad min  max range skew kurtosis
## X1    1 1460 472.98 213.8    480  469.81 177.91   0 1418  1418 0.18      0.9
##     se
## X1 5.6

univariate scatter plots

# Index plots: each variable's values are drawn against their position in the
# vector, one full-size panel per figure.
par(mfrow=c(1,1))
plot(dep_var)

plot(ind_var1)

plot(ind_var2)

hist/bar plots

# Two panels per figure: boxplot on the left, histogram on the right.
par(mfrow = c(1, 2))

# Titles use the actual column name (SalePrice, not "SalesPrice"), and the
# histograms label the x-axis with the column name rather than the temporary
# variable name.
boxplot(dep_var, main = "SalePrice BoxPlot")
hist(dep_var, breaks = 20, main = "SalePrice Histogram", xlab = "SalePrice")

boxplot(ind_var1, main = "LotArea BoxPlot")
hist(ind_var1, breaks = 20, main = "LotArea Histogram", xlab = "LotArea")

boxplot(ind_var2, main = "GarageArea BoxPlot")
hist(ind_var2, breaks = 20, main = "GarageArea Histogram", xlab = "GarageArea")

relational plots

# Scatter plots with the dependent variable (SalePrice) on the y-axis and each
# independent variable on the x-axis.
plot(dep_var ~ ind_var1, xlab = "LotArea", ylab = "SalePrice")

plot(dep_var ~ ind_var2, xlab = "GarageArea", ylab = "SalePrice")

# Pairwise Pearson correlations, computed once and reused for the correlogram.
pairwise_cor <- cor(train_set[, c("SalePrice", "LotArea", "GarageArea")])
print(pairwise_cor)
##            SalePrice   LotArea GarageArea
## SalePrice  1.0000000 0.2638434  0.6234314
## LotArea    0.2638434 1.0000000  0.1804028
## GarageArea 0.6234314 0.1804028  1.0000000
par(mfrow = c(1, 1))
corrplot(pairwise_cor, method = "circle")

Pairwise Analysis

# Pearson correlation test between SalePrice and LotArea, reporting an 80%
# confidence interval for the correlation.
cor.test(train_set$SalePrice,train_set$LotArea, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_set$SalePrice and train_set$LotArea
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434
# Pearson correlation test between SalePrice and GarageArea, reporting an 80%
# confidence interval for the correlation.
cor.test(train_set$SalePrice,train_set$GarageArea, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_set$SalePrice and train_set$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
# Pearson correlation test between LotArea and GarageArea, reporting an 80%
# confidence interval for the correlation.
cor.test(train_set$LotArea,train_set$GarageArea, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train_set$LotArea and train_set$GarageArea
## t = 7.0034, df = 1458, p-value = 3.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.1477356 0.2126767
## sample estimates:
##       cor 
## 0.1804028

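All three p-values are far below any conventional significance level, so for each pair we reject the null hypothesis that the true correlation is 0. Running several tests on the same data does inflate the familywise error rate, i.e. the chance of at least one false positive across the whole family of tests. Here, however, the largest p-value is 3.8e-12, so the conclusions would still hold after a Bonferroni correction for three tests. Familywise error would become a real concern if we tested many more pairs across the dataset, where some apparent relationships could arise by chance alone.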

Linear Algebra and Correlation

# Correlation matrix of the three study variables and its inverse.
cor_columns <- c("SalePrice", "LotArea", "GarageArea")
cormat <- cor(train_set[, cor_columns])
inversecormat <- solve(cormat)

print(cormat)
##            SalePrice   LotArea GarageArea
## SalePrice  1.0000000 0.2638434  0.6234314
## LotArea    0.2638434 1.0000000  0.1804028
## GarageArea 0.6234314 0.1804028  1.0000000
print(inversecormat)
##             SalePrice     LotArea  GarageArea
## SalePrice   1.7016986 -0.26625940 -1.01285847
## LotArea    -0.2662594  1.07530074 -0.02799273
## GarageArea -1.0128585 -0.02799273  1.63649778
# Multiplying the matrix by its inverse, in either order, should give the
# identity matrix up to floating-point error.
cor_times_inverse <- cormat %*% inversecormat
print(cor_times_inverse)
##            SalePrice      LotArea GarageArea
## SalePrice          1 2.428613e-17          0
## LotArea            0 1.000000e+00          0
## GarageArea         0 3.469447e-17          1
inverse_times_cor <- inversecormat %*% cormat
print(inverse_times_cor)
##               SalePrice       LotArea   GarageArea
## SalePrice  1.000000e+00 -5.551115e-17 0.000000e+00
## LotArea    7.979728e-17  1.000000e+00 6.591949e-17
## GarageArea 0.000000e+00  0.000000e+00 1.000000e+00

Matrix Decomposition

# LU-factorise the correlation matrix once and reuse both factors.
cor_lu <- matrixcalc::lu.decomposition(cormat)

print(cor_lu$L)
##           [,1]       [,2] [,3]
## [1,] 1.0000000 0.00000000    0
## [2,] 0.2638434 1.00000000    0
## [3,] 0.6234314 0.01710527    1
print(cor_lu$U)
##      [,1]      [,2]       [,3]
## [1,]    1 0.2638434 0.62343144
## [2,]    0 0.9303867 0.01591451
## [3,]    0 0.0000000 0.61106102
# Check that L %*% U reproduces the original matrix. The comparison is done
# element-wise with a small tolerance, because exact `==` on floating-point
# products can report FALSE for values that differ only by rounding error.
lu_product <- cor_lu$L %*% cor_lu$U
print(abs(lu_product - cormat) < sqrt(.Machine$double.eps))
##            SalePrice LotArea GarageArea
## SalePrice       TRUE    TRUE       TRUE
## LotArea         TRUE    TRUE       TRUE
## GarageArea      TRUE    TRUE       TRUE
# Same reconstruction for the inverse correlation matrix.
inverse_lu <- matrixcalc::lu.decomposition(inversecormat)
print(inverse_lu$L %*% inverse_lu$U)
##            [,1]        [,2]        [,3]
## [1,]  1.7016986 -0.26625940 -1.01285847
## [2,] -0.2662594  1.07530074 -0.02799273
## [3,] -1.0128585 -0.02799273  1.63649778

Calculus-Based Probability and Stats

# Report whether the variable contains any negative values before fitting.
has_negative <- min(ind_var2) < 0
paste0("minimum value is below 0: ", has_negative)
## [1] "minimum value is below 0: FALSE"
# Fit an exponential distribution to GarageArea (ind_var2).
exp_fit_dist_cacl <- MASS::fitdistr(ind_var2, densfun = "exponential")

# Draw 1000 values from an exponential distribution with the fitted rate.
fitted_rate <- exp_fit_dist_cacl$estimate
exp_sample <- rexp(1000, rate = fitted_rate)

# Observed data on the left, simulated exponential sample on the right.
par(mfrow = c(1, 2))
hist(ind_var2)
hist(exp_sample)

# 5th and 95th percentiles of the simulated sample.
quantile(exp_sample, probs = c(0.05, 0.95))
##         5%        95% 
##   21.09845 1472.69634

calcs assuming normality

# Normal-theory 95% confidence interval for the mean of GarageArea (ind_var2).
mu_indvar2 <- mean(ind_var2)
sigma_indvar2 <- sd(ind_var2)
count_indvar2 <- length(ind_var2)

# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is
# qnorm(0.975). qnorm(0.95) would produce a 90% interval instead.
# NOTE: despite its name, standard_error holds the interval half-width (the
# margin of error), not sd / sqrt(n). The name is kept in case later code
# refers to it.
standard_error <- qnorm(0.975) * sigma_indvar2 / sqrt(count_indvar2)

print(paste0("confidence interval 95% range: ", mu_indvar2 - standard_error, " to ", mu_indvar2 + standard_error))
# Re-run to refresh the printed interval: with the 0.975 critical value it is
# roughly 462.01 to 483.95 (the previously recorded 463.78 to 482.18 was the
# 90% interval).
# Empirical 5th and 95th percentiles of the data, for comparison.
print(quantile(ind_var2, c(.05, .95)))
##    5%   95% 
##   0.0 850.1

Modeling

# Drop every column in which at least half of the values are missing.
# The share of NA values is computed for all columns at once, and the data
# frame is filtered in a single step instead of being modified inside a loop.
na_share <- colSums(is.na(train_set)) / nrow(train_set)
train_set <- train_set[na_share < 0.5]


summary(train_set)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street            LotShape         LandContour       
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##   Utilities          LotConfig          LandSlope         Neighborhood      
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Condition1         Condition2          BldgType          HouseStyle       
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##   RoofStyle           RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   MasVnrType          MasVnrArea      ExterQual          ExterCond        
##  Length:1460        Min.   :   0.0   Length:1460        Length:1460       
##  Class :character   1st Qu.:   0.0   Class :character   Class :character  
##  Mode  :character   Median :   0.0   Mode  :character   Mode  :character  
##                     Mean   : 103.7                                        
##                     3rd Qu.: 166.0                                        
##                     Max.   :1600.0                                        
##                     NA's   :8                                             
##   Foundation          BsmtQual           BsmtCond         BsmtExposure      
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtFinType1         BsmtFinSF1     BsmtFinType2         BsmtFinSF2     
##  Length:1460        Min.   :   0.0   Length:1460        Min.   :   0.00  
##  Class :character   1st Qu.:   0.0   Class :character   1st Qu.:   0.00  
##  Mode  :character   Median : 383.5   Mode  :character   Median :   0.00  
##                     Mean   : 443.6                      Mean   :  46.55  
##                     3rd Qu.: 712.2                      3rd Qu.:   0.00  
##                     Max.   :5644.0                      Max.   :1474.00  
##                                                                          
##    BsmtUnfSF       TotalBsmtSF       Heating           HeatingQC        
##  Min.   :   0.0   Min.   :   0.0   Length:1460        Length:1460       
##  1st Qu.: 223.0   1st Qu.: 795.8   Class :character   Class :character  
##  Median : 477.5   Median : 991.5   Mode  :character   Mode  :character  
##  Mean   : 567.2   Mean   :1057.4                                        
##  3rd Qu.: 808.0   3rd Qu.:1298.2                                        
##  Max.   :2336.0   Max.   :6110.0                                        
##                                                                         
##   CentralAir         Electrical          X1stFlrSF      X2ndFlrSF   
##  Length:1460        Length:1460        Min.   : 334   Min.   :   0  
##  Class :character   Class :character   1st Qu.: 882   1st Qu.:   0  
##  Mode  :character   Mode  :character   Median :1087   Median :   0  
##                                        Mean   :1163   Mean   : 347  
##                                        3rd Qu.:1391   3rd Qu.: 728  
##                                        Max.   :4692   Max.   :2065  
##                                                                     
##   LowQualFinSF       GrLivArea     BsmtFullBath     BsmtHalfBath    
##  Min.   :  0.000   Min.   : 334   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.000   Median :1464   Median :0.0000   Median :0.00000  
##  Mean   :  5.845   Mean   :1515   Mean   :0.4253   Mean   :0.05753  
##  3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :572.000   Max.   :5642   Max.   :3.0000   Max.   :2.00000  
##                                                                     
##     FullBath        HalfBath       BedroomAbvGr    KitchenAbvGr  
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :2.000   Median :0.0000   Median :3.000   Median :1.000  
##  Mean   :1.565   Mean   :0.3829   Mean   :2.866   Mean   :1.047  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000  
##  Max.   :3.000   Max.   :2.0000   Max.   :8.000   Max.   :3.000  
##                                                                  
##  KitchenQual         TotRmsAbvGrd     Functional          Fireplaces   
##  Length:1460        Min.   : 2.000   Length:1460        Min.   :0.000  
##  Class :character   1st Qu.: 5.000   Class :character   1st Qu.:0.000  
##  Mode  :character   Median : 6.000   Mode  :character   Median :1.000  
##                     Mean   : 6.518                      Mean   :0.613  
##                     3rd Qu.: 7.000                      3rd Qu.:1.000  
##                     Max.   :14.000                      Max.   :3.000  
##                                                                        
##  FireplaceQu         GarageType         GarageYrBlt   GarageFinish      
##  Length:1460        Length:1460        Min.   :1900   Length:1460       
##  Class :character   Class :character   1st Qu.:1961   Class :character  
##  Mode  :character   Mode  :character   Median :1980   Mode  :character  
##                                        Mean   :1979                     
##                                        3rd Qu.:2002                     
##                                        Max.   :2010                     
##                                        NA's   :81                       
##    GarageCars      GarageArea      GarageQual         GarageCond       
##  Min.   :0.000   Min.   :   0.0   Length:1460        Length:1460       
##  1st Qu.:1.000   1st Qu.: 334.5   Class :character   Class :character  
##  Median :2.000   Median : 480.0   Mode  :character   Mode  :character  
##  Mean   :1.767   Mean   : 473.0                                        
##  3rd Qu.:2.000   3rd Qu.: 576.0                                        
##  Max.   :4.000   Max.   :1418.0                                        
##                                                                        
##   PavedDrive          WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Length:1460        Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Class :character   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Median :  0.00   Median : 25.00   Median :  0.00  
##                     Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##                     3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##                     Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                       
##    X3SsnPorch      ScreenPorch        PoolArea          MiscVal        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   :    0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00  
##  Median :  0.00   Median :  0.00   Median :  0.000   Median :    0.00  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   Mean   :   43.49  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00  
##  Max.   :508.00   Max.   :480.00   Max.   :738.000   Max.   :15500.00  
##                                                                        
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# Build the modelling frame: keep only the non-character columns of the
# training set, then discard every row that still contains a missing value.
clean_training_data <- na.omit(
  train_set[, !vapply(train_set, is.character, logical(1))]
)

# Additional cleaning: remove predictors that are excluded from the model.
clean_training_data[["TotalBsmtSF"]] <- NULL
clean_training_data[["GrLivArea"]] <- NULL
clean_training_data[["OpenPorchSF"]] <- NULL
clean_training_data[["YrSold"]] <- NULL
clean_training_data[["HalfBath"]] <- NULL
clean_training_data[["EnclosedPorch"]] <- NULL
clean_training_data[["BsmtHalfBath"]] <- NULL

# Regress SalePrice on every remaining column and print the fit summary.
# NOTE(review): `Id` is still in the frame at this point, so it enters the
# model as a predictor -- confirm whether that is intended.
multi_reg <- lm(SalePrice ~ ., data = clean_training_data)
summary(multi_reg)
## 
## Call:
## lm(formula = SalePrice ~ ., data = clean_training_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -443741  -17119   -2425   15105  317646 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.012e+05  1.628e+05  -4.922 9.91e-07 ***
## Id           -1.243e+00  2.652e+00  -0.469 0.639359    
## MSSubClass   -1.993e+02  3.434e+01  -5.803 8.52e-09 ***
## LotFrontage  -1.159e+02  6.098e+01  -1.900 0.057695 .  
## LotArea       5.394e-01  1.568e-01   3.439 0.000605 ***
## OverallQual   1.871e+04  1.468e+03  12.738  < 2e-16 ***
## OverallCond   5.204e+03  1.343e+03   3.874 0.000113 ***
## YearBuilt     3.015e+02  8.023e+01   3.758 0.000180 ***
## YearRemodAdd  1.214e+02  8.599e+01   1.412 0.158268    
## MasVnrArea    3.141e+01  6.979e+00   4.501 7.48e-06 ***
## BsmtFinSF1    1.761e+01  5.755e+00   3.059 0.002273 ** 
## BsmtFinSF2    8.551e+00  8.654e+00   0.988 0.323307    
## BsmtUnfSF     4.882e+00  5.225e+00   0.934 0.350342    
## X1stFlrSF     4.584e+01  7.337e+00   6.248 5.97e-10 ***
## X2ndFlrSF     4.556e+01  5.291e+00   8.610  < 2e-16 ***
## LowQualFinSF  3.331e+01  2.784e+01   1.197 0.231753    
## BsmtFullBath  8.660e+03  3.044e+03   2.845 0.004527 ** 
## FullBath      5.714e+03  3.186e+03   1.794 0.073138 .  
## BedroomAbvGr -1.006e+04  2.138e+03  -4.708 2.82e-06 ***
## KitchenAbvGr -2.212e+04  6.674e+03  -3.315 0.000947 ***
## TotRmsAbvGrd  5.400e+03  1.477e+03   3.655 0.000270 ***
## Fireplaces    4.353e+03  2.172e+03   2.004 0.045340 *  
## GarageYrBlt  -5.013e+01  9.069e+01  -0.553 0.580509    
## GarageCars    1.692e+04  3.456e+03   4.895 1.13e-06 ***
## GarageArea    6.461e+00  1.202e+01   0.538 0.590949    
## WoodDeckSF    2.158e+01  9.935e+00   2.172 0.030092 *  
## X3SsnPorch    3.359e+01  3.735e+01   0.899 0.368712    
## ScreenPorch   5.609e+01  2.008e+01   2.793 0.005315 ** 
## PoolArea     -5.846e+01  2.962e+01  -1.973 0.048709 *  
## MiscVal      -3.679e+00  6.911e+00  -0.532 0.594639    
## MoSold       -2.064e+02  4.143e+02  -0.498 0.618437    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36730 on 1090 degrees of freedom
## Multiple R-squared:  0.8094, Adjusted R-squared:  0.8042 
## F-statistic: 154.3 on 30 and 1090 DF,  p-value: < 2.2e-16
# Residual diagnostics for the fitted model: residuals against fitted values,
# then a normal Q-Q plot of the residuals with its reference line.
plot(fitted(multi_reg), resid(multi_reg))

qqnorm(resid(multi_reg))
qqline(resid(multi_reg))

# Score the test set with the fitted model and plot the predictions next to
# the observed training prices for a visual comparison.
prediction_data <- test_set
prediction_data$SalePrice <- predict(multi_reg, test_set)
plot(prediction_data$SalePrice)

plot(train_set$SalePrice)

# Submission frame; data.frame() names the columns `test_set.Id` and
# `prediction_data.SalePrice`.
kaggle_df <- data.frame(test_set$Id, prediction_data$SalePrice)

# Replace NA predictions (presumably rows where a model predictor is missing
# in the test set) with a neighbouring prediction. "downup" behaves like the
# default downward fill wherever an earlier value exists, and additionally
# fills NAs at the very top of the frame from the first available prediction,
# so no NA can remain in the submission.
# NOTE(review): copying a neighbouring row's prediction is a crude imputation;
# imputing the missing predictors before calling predict() would likely score
# better.
kaggle_df <- kaggle_df %>%
  fill(prediction_data.SalePrice, .direction = "downup")

#write.csv(kaggle_df, file = "final_Model_Prediction_Project.csv", row.names = FALSE)

USERNAME: manonfire86

SCORE: 0.38709