library(gridExtra)
library(RColorBrewer)
library(Matrix)
library(scales)
library(corrplot)
library(MASS)
library(psych)
library(ggplot2)
library(matlib)
library(dplyr)
library(tidyr)
library(kableExtra)
library(purrr)
library(Hmisc)

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \(\mu=\sigma=(N+1)/2\).
a. P(X>x | X>y) b. P(X>x, Y>y) c. P(X<x | X>y)
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities. Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# Reproducible simulation setup for Problem 1.
set.seed(666)
N <- 10     # upper bound of the uniform variable (the problem requires N >= 6)
n <- 10000  # number of draws; also the denominator of the empirical probabilities

# X ~ Uniform(1, N). The sample size comes from n so that it cannot drift
# away from the denominator used in the probability estimates.
X <- runif(n, min = 1, max = N)

# Y ~ Normal with mean = sd = (N + 1) / 2.
Y <- rnorm(n, mean = (N + 1) / 2, sd = (N + 1) / 2)

Probability

Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable.

# Threshold x: the sample median of X.
x <- median(X)
print(x)
## [1] 5.552747
# Threshold y: the first quartile of Y (quantile() keeps the "25%" name).
y <- quantile(Y, probs = 0.25)
print(y)
##      25% 
## 1.873373

a. P(X>x | X>y)

# P(X > x | X > y) = P(X > x and X > y) / P(X > y), estimated from the sample.
above_y <- X > y
pXy <- sum(above_y) / n            # P(X > y)
pAll <- sum(X > x & above_y) / n   # P(X > x and X > y)

p1 <- pAll / pXy

round(p1, digits = 2)
## [1] 0.55

The probability is 0.55 or 55%.

b. P(X>x, Y>y)

# Joint probability P(X > x, Y > y): share of draws where both events occur.
both_above <- sum(X > x & Y > y)
p2 <- both_above / n
round(p2, digits = 2)
## [1] 0.38

c. P(X<x | X>y)

# P(X < x | X > y) is a conditional probability: the count of draws with
# X < x and X > y must be divided by the count of draws with X > y.
# Dividing by n alone only yields the joint probability P(X < x, X > y).
# NOTE(review): the rendered output below this chunk predates this fix and
# needs to be regenerated.
p3 <- sum(X < x & X > y) / sum(X > y)
round(p3, 2)
## [1] 0.41

Next Independence testing

Investigate whether P(X>x and Y>y) = P(X > x) * P(Y > y) by building a table and evaluating the marginal and joint probabilities.

# Cell counts of the 2 x 2 contingency table (rows: Y side, columns: X side).
hi_x_lo_y <- sum(X > x & Y < y)
hi_x_hi_y <- sum(X > x & Y > y)
lo_x_lo_y <- sum(X < x & Y < y)
lo_x_hi_y <- sum(X < x & Y > y)

# Build each row with its row total appended, then stack the two rows and
# their element-wise sum (the column totals) into a 3 x 3 matrix.
row_lo_y <- c(hi_x_lo_y, lo_x_lo_y, hi_x_lo_y + lo_x_lo_y)
row_hi_y <- c(hi_x_hi_y, lo_x_hi_y, hi_x_hi_y + lo_x_hi_y)
m <- rbind(row_lo_y, row_hi_y, row_lo_y + row_hi_y, deparse.level = 0)

# Labelled copy of the counts for display as a bordered table.
df <- as.data.frame(m)
dimnames(df) <- list(c("Y<y", "Y>y", "Total"), c("X>x", "X<x", "Total"))
kable_styling(kable(df), bootstrap_options = "bordered")
X>x X<x Total
Y<y 1250 1250 2500
Y>y 3750 3750 7500
Total 5000 5000 10000
# Convert the counts to relative frequencies by dividing by the grand total.
grand_total <- m[3, 3]
pm <- m / grand_total

# Labelled copy of the probabilities, shown rounded to two decimals.
dfp <- as.data.frame(pm)
dimnames(dfp) <- list(c("Y<y", "Y>y", "Total"), c("X>x", "X<x", "Total"))
kable_styling(kable(round(dfp, 2)), bootstrap_options = "bordered")
X>x X<x Total
Y<y 0.12 0.12 0.25
Y>y 0.38 0.38 0.75
Total 0.50 0.50 1.00

Calculating

# Product of the marginals, P(X > x) * P(Y > y), read from the "Total" row
# and column of the probability table.
p1 <- pm[3, 1] * pm[2, 3]
p1
## [1] 0.375
# Joint probability P(X > x and Y > y), read from the table cell itself.
p2 <- pm[2, 1]
p2
## [1] 0.375
# Does P(X > x and Y > y) = P(X > x) P(Y > y)?
# Floating-point values should not be compared with `==`, and rounding only
# one side makes the comparison asymmetric. Compare both unrounded values
# with an explicit tolerance equivalent to agreement at three decimals.
abs(p1 - p2) < 5e-4
## [1] TRUE
# Test independence on the 2 x 2 cell counts only. m also carries the
# "Total" row and column; passing the full matrix makes chisq.test() treat
# the margins as extra categories (a 3 x 3 table with 4 degrees of freedom).
# NOTE(review): the rendered output below predates this fix; re-knit to refresh.
chisq.test(m[1:2, 1:2], correct = TRUE)
## 
##  Pearson's Chi-squared test
## 
## data:  m
## X-squared = 0, df = 4, p-value = 1
# Fisher's exact test on the 2 x 2 cell counts only; the "Total" row and
# column of m are margins, not observations. The exact test is used directly
# for the 2 x 2 table, so no simulated p-value is requested.
# NOTE(review): the rendered output below predates this fix; re-knit to refresh.
fisher.test(m[1:2, 1:2])
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  m
## p-value = 1
## alternative hypothesis: two.sided

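Fisher’s Exact Test computes an exact p-value and is typically used when cell counts are small, whereas the Chi Square Test relies on a large-sample approximation and is used when the cell sizes are large. The cell sizes here are large, so the Chi Square Test is the appropriate one to use in this case.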

Problem Two

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

Descriptive and Inferential Statistics

# Load the House Prices training and test sets from the project repository.
train_url <- "https://raw.githubusercontent.com/hrensimin05/Data605_FinalProject/main/train.csv"
test_url <- "https://raw.githubusercontent.com/hrensimin05/Data605_FinalProject/main/test.csv"
train <- read.csv(file = train_url)
test <- read.csv(file = test_url)

# Dimensions (rows, columns) of the training data.
dim(train)
## [1] 1460   81
# Per-column summary statistics of the training data.
summary(train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Variables

# Sale price against construction year, with a fitted linear trend line.
ggplot(data = train, mapping = aes(YearBuilt, SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma)
## `geom_smooth()` using formula 'y ~ x'

# Sale price against overall quality rating, with a fitted linear trend line;
# the axes are flipped so the quality rating runs along the vertical axis.
ggplot(data = train, mapping = aes(OverallQual, SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
## `geom_smooth()` using formula 'y ~ x'

# Neighborhood is categorical, so a linear smoother across it is not
# meaningful. Summarise the price distribution of each neighborhood with
# box plots instead; the axes are flipped so the names stay readable.
ggplot(train, aes(x = Neighborhood, y = SalePrice)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
## `geom_smooth()` using formula 'y ~ x'

Correlation

# Correlation matrix of the two chosen predictors and the response.
data <- train %>%
  select(YearBuilt, OverallQual, SalePrice)
mat <- cor(data)

# Visualise the correlations as a colour-coded grid.
corrplot(mat, method = "color")

Hypothesis Testing

SalePrice and YearBuilt

# Pearson correlation test, SalePrice vs YearBuilt, with an 80% confidence interval.
cor.test(x = train$SalePrice, y = train$YearBuilt, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980766 0.5468619
## sample estimates:
##       cor 
## 0.5228973

SalePrice and OverallQual

# Pearson correlation test, SalePrice vs OverallQual, with an 80% confidence interval.
cor.test(x = train$SalePrice, y = train$OverallQual, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$OverallQual
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780752 0.8032204
## sample estimates:
##       cor 
## 0.7909816

SalePrice and FullBath

# Pearson correlation test, SalePrice vs FullBath, with an 80% confidence interval.
cor.test(x = train$SalePrice, y = train$FullBath, method = "pearson", conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$FullBath
## t = 25.854, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5372107 0.5832505
## sample estimates:
##       cor 
## 0.5606638

We can see from the above examples that the correlation is not equal to 0 for any of the three pairs, and with 80 percent confidence the correlations lie between 0.50 and 0.55, 0.78 and 0.80, and 0.54 and 0.58, respectively.

The familywise error rate (FWE or FWER) is the probability of coming to at least one false conclusion in a series of hypothesis tests. I would not be worried about it in our case because every p-value is far below 0.05 (p < 2.2e-16). The p-value works as an alternative to the rejection point: it is the smallest level of significance at which the null hypothesis would be rejected, so the three null hypotheses of zero correlation would still be rejected after a correction for multiple comparisons.

Linear Algebra and Correlation.

# Precision matrix: the inverse of the correlation matrix.
pmatrix <- solve(mat)
pmatrix
##              YearBuilt OverallQual  SalePrice
## YearBuilt    1.5168013  -0.6431116 -0.2844419
## OverallQual -0.6431116   2.9439846 -1.9923563
## SalePrice   -0.2844419  -1.9923563  2.7246511
# Correlation matrix times precision matrix, rounded to four decimals.
cp_product <- mat %*% pmatrix
round(cp_product, digits = 4)
##             YearBuilt OverallQual SalePrice
## YearBuilt           1           0         0
## OverallQual         0           1         0
## SalePrice           0           0         1
# Precision matrix times correlation matrix, rounded to four decimals.
pcmat <- round(pmatrix %*% mat, digits = 4)
print(pcmat)
##             YearBuilt OverallQual SalePrice
## YearBuilt           1           0         0
## OverallQual         0           1         0
## SalePrice           0           0         1
##             YearBuilt OverallQual SalePrice
## YearBuilt           1           0         0
## OverallQual         0           1         0
## SalePrice           0           0         1

The precision matrix is the inverse of the correlation matrix; multiplying them in either order gives the identity matrix.

LU decomposition

Correlation Matrix

library(matrixcalc)
## Warning: package 'matrixcalc' was built under R version 4.0.3
## 
## Attaching package: 'matrixcalc'
## The following object is masked from 'package:matlib':
## 
##     vec
# Decompose the correlation matrix itself. pcmat is the rounded product of
# the precision and correlation matrices (the identity matrix), so factoring
# it only returns identity L and U factors.
# NOTE(review): the rendered output below predates this fix; re-knit to refresh.
lu.decomposition(mat)
## $L
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1
## 
## $U
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

Calculus-Based Probability & Statistic

# Variable under study: total basement area.
z <- train[["TotalBsmtSF"]]

# Smallest observed value.
min(z)
## [1] 0
# Quick look at the empirical distribution.
hist(z)

Then load the MASS package and run fitdistr to fit an exponential probability density function.

# Maximum-likelihood fit of an exponential distribution to z.
fit <- MASS::fitdistr(x = z, densfun = "exponential")
print(fit)
##        rate    
##   9.456896e-04 
##  (2.474983e-05)
# Estimated rate parameter of the fitted distribution.
fit[["estimate"]]
##         rate 
## 0.0009456896
# Draw 1000 values from the fitted exponential distribution.
# The name shadows base::sample() but is kept because later code reads it.
sample <- rexp(1000, rate = fit$estimate)

# Plot observed and simulated data side by side, then restore the previous
# graphics settings so the two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
hist(z, breaks = 100, xlab = "Observed", main = "Observed")
hist(sample, breaks = 100, xlab = "Simulated", main = "Simulated")
par(old_par)

We can notice from the histograms that the simulated data is more heavily skewed to the right while the observed data is more concentrated to the center.

# 5th and 95th percentiles of the simulated exponential sample.
quantile(sample, probs = c(0.05, 0.95))
##         5%        95% 
##   52.02832 3197.44850
# Normal-theory interval for the mean: mean -/+ z * sd / sqrt(n).
# qnorm(0.95) leaves 5% in each tail, i.e. a two-sided 90% interval.
# NOTE(review): a two-sided 95% interval would use qnorm(0.975) instead.
bsmt <- train$TotalBsmtSF
half_width <- qnorm(0.95) * sd(bsmt) / sqrt(length(bsmt))
# Lower bound.
mean(bsmt) - half_width
## [1] 1038.544
# Upper bound.
mean(bsmt) + half_width
## [1] 1076.315
# Empirical 5th and 95th percentiles of the observed data.
quantile(bsmt, probs = c(0.05, 0.95))
##     5%    95% 
##  519.3 1753.0

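For TotalBsmtSF, the normal-theory confidence interval for the mean computed above is 1038 < μ < 1076. Because it is built with qnorm(0.95), which leaves 5% in each tail, this is a 90% CI; a 95% CI would use qnorm(0.975) and be slightly wider.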

5th percentile is 519 and 95th percentile is 1753.

Modeling

Training Data and Model Generation

# Flag the numeric columns; vapply pins the result to one logical per column.
dt <- vapply(train, is.numeric, logical(1))
# Keep only those columns for the correlation screen.
dt_df <- train[, dt]

# Preview the first rows of the numeric subset.
head(dt_df)
##   Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## 1  1         60          65    8450           7           5      2003
## 2  2         20          80    9600           6           8      1976
## 3  3         60          68   11250           7           5      2001
## 4  4         70          60    9550           7           5      1915
## 5  5         60          84   14260           8           5      2000
## 6  6         50          85   14115           5           5      1993
##   YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF X1stFlrSF
## 1         2003        196        706          0       150         856       856
## 2         1976          0        978          0       284        1262      1262
## 3         2002        162        486          0       434         920       920
## 4         1970          0        216          0       540         756       961
## 5         2000        350        655          0       490        1145      1145
## 6         1995          0        732          0        64         796       796
##   X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1       854            0      1710            1            0        2        1
## 2         0            0      1262            0            1        2        0
## 3       866            0      1786            1            0        2        1
## 4       756            0      1717            1            0        1        0
## 5      1053            0      2198            1            0        2        1
## 6       566            0      1362            1            0        1        1
##   BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 1            3            1            8          0        2003          2
## 2            3            1            6          1        1976          2
## 3            3            1            6          1        2001          2
## 4            3            1            7          1        1998          3
## 5            4            1            9          1        2000          3
## 6            1            1            5          0        1993          2
##   GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1        548          0          61             0          0           0
## 2        460        298           0             0          0           0
## 3        608          0          42             0          0           0
## 4        642          0          35           272          0           0
## 5        836        192          84             0          0           0
## 6        480         40          30             0        320           0
##   PoolArea MiscVal MoSold YrSold SalePrice
## 1        0       0      2   2008    208500
## 2        0       0      5   2007    181500
## 3        0       0      9   2008    223500
## 4        0       0      2   2006    140000
## 5        0       0     12   2008    250000
## 6        0     700     10   2009    143000
# Correlation of every numeric column with SalePrice, using only the rows
# where both values are present.
sale_price <- dt_df$SalePrice
cor_with_price <- vapply(
  dt_df,
  function(col) cor(col, sale_price, use = "complete.obs"),
  numeric(1)
)
# One-column data frame; the row names are the variable names.
c_prices <- data.frame(cor = cor_with_price)
print(c_prices)
##                       cor
## Id            -0.02191672
## MSSubClass    -0.08428414
## LotFrontage    0.35179910
## LotArea        0.26384335
## OverallQual    0.79098160
## OverallCond   -0.07785589
## YearBuilt      0.52289733
## YearRemodAdd   0.50710097
## MasVnrArea     0.47749305
## BsmtFinSF1     0.38641981
## BsmtFinSF2    -0.01137812
## BsmtUnfSF      0.21447911
## TotalBsmtSF    0.61358055
## X1stFlrSF      0.60585218
## X2ndFlrSF      0.31933380
## LowQualFinSF  -0.02560613
## GrLivArea      0.70862448
## BsmtFullBath   0.22712223
## BsmtHalfBath  -0.01684415
## FullBath       0.56066376
## HalfBath       0.28410768
## BedroomAbvGr   0.16821315
## KitchenAbvGr  -0.13590737
## TotRmsAbvGrd   0.53372316
## Fireplaces     0.46692884
## GarageYrBlt    0.48636168
## GarageCars     0.64040920
## GarageArea     0.62343144
## WoodDeckSF     0.32441344
## OpenPorchSF    0.31585623
## EnclosedPorch -0.12857796
## X3SsnPorch     0.04458367
## ScreenPorch    0.11144657
## PoolArea       0.09240355
## MiscVal       -0.02118958
## MoSold         0.04643225
## YrSold        -0.02892259
## SalePrice      1.00000000
# Keep the variables whose correlation with SalePrice exceeds 0.5.
strong <- which(c_prices$cor > 0.5)
c_prices[strong, , drop = FALSE]
##                    cor
## OverallQual  0.7909816
## YearBuilt    0.5228973
## YearRemodAdd 0.5071010
## TotalBsmtSF  0.6135806
## X1stFlrSF    0.6058522
## GrLivArea    0.7086245
## FullBath     0.5606638
## TotRmsAbvGrd 0.5337232
## GarageCars   0.6404092
## GarageArea   0.6234314
## SalePrice    1.0000000
# Linear model of SalePrice on the predictors whose correlation with it
# exceeds 0.5.
model <- lm(
  SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + TotalBsmtSF +
    X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + GarageCars + GarageArea,
  data = train
)

# Coefficients, significance tests and goodness-of-fit statistics.
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + 
##     TotalBsmtSF + X1stFlrSF + GrLivArea + FullBath + TotRmsAbvGrd + 
##     GarageCars + GarageArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489958  -19316   -1948   16020  290558 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.186e+06  1.291e+05  -9.187  < 2e-16 ***
## OverallQual   1.960e+04  1.190e+03  16.472  < 2e-16 ***
## YearBuilt     2.682e+02  5.035e+01   5.328 1.15e-07 ***
## YearRemodAdd  2.965e+02  6.363e+01   4.659 3.47e-06 ***
## TotalBsmtSF   1.986e+01  4.295e+00   4.625 4.09e-06 ***
## X1stFlrSF     1.417e+01  4.930e+00   2.875 0.004097 ** 
## GrLivArea     5.130e+01  4.233e+00  12.119  < 2e-16 ***
## FullBath     -6.791e+03  2.682e+03  -2.532 0.011457 *  
## TotRmsAbvGrd  3.310e+01  1.119e+03   0.030 0.976404    
## GarageCars    1.042e+04  3.044e+03   3.422 0.000639 ***
## GarageArea    1.495e+01  1.031e+01   1.450 0.147384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared:  0.7737, Adjusted R-squared:  0.7721 
## F-statistic: 495.4 on 10 and 1449 DF,  p-value: < 2.2e-16

With an R^2 value of 0.7737, 77.37% of the variance in SalePrice can be explained by this model.

# Fill missing predictor values in the test set with the training medians so
# that predict() returns a price for every house instead of NA.
predictors <- all.vars(formula(model))[-1]  # drop the response, keep predictors
test_filled <- test
for (v in predictors) {
  gaps <- is.na(test_filled[[v]])
  test_filled[[v]][gaps] <- median(train[[v]], na.rm = TRUE)
}
myprediction <- predict(model, test_filled)

# A price of 0 (or below) is never a sensible prediction; floor the
# predictions at the lowest sale price seen in the training data. Only the
# SalePrice values are adjusted, never the Id column.
price_floor <- min(train$SalePrice)
myprediction <- pmax(myprediction, price_floor)

# Submission table: one predicted SalePrice per test Id.
DMDmodel <- data.frame(Id = test[, "Id"], SalePrice = myprediction)

head(DMDmodel)
##     Id SalePrice
## 1 1461  110135.9
## 2 1462  159060.0
## 3 1463  169683.7
## 4 1464  188059.7
## 5 1465  219782.0
## 6 1466  182152.0
# Write the submission file without row names.
write.csv(DMDmodel, file = "DMDmodel.csv", row.names = FALSE)

The resulting multivariate model explains 77.37% of the variance in SalePrice, with statistically significant p-values for most of the chosen variables (TotRmsAbvGrd and GarageArea are not significant at the 0.05 level). The residual standard error, the standard deviation of the residuals, is 37920 on 1449 degrees of freedom, so the predicted price will typically deviate from the actual price by about 37920.

Kaggle Submission - Score

Report your Kaggle.com user name and score.

Kaggle Username: hrensimin05

Kaggle Score: 0.79801

See submission documents : https://github.com/hrensimin05/Data605_FinalProject https://rpubs.com/hrensimin05/774107