Link to Rpubs

Problem 1.

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \(\mu = \sigma = (N+1)/2\).

#set seed
set.seed(12345)
N <- 19
n <- 10000
mu <- sigma <- (N + 1)/2

# generate a random variable X that has 10,000 random uniform numbers from 1 to N

df <- data.frame(X = runif(n, min = 1, max = N), Y = rnorm(n, mean = mu, sd = sigma))
summary(df$X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.001   5.566  10.082  10.012  14.485  18.999
#Plot X
hist(df$X, breaks = 10, col = "skyblue3")

summary(df$Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -26.662   3.357   9.967  10.000  16.643  48.665
#Plot Y
hist(df$Y, breaks = 10, col = "skyblue3")

From the histograms we can see that Y is approximately normally distributed, while X is roughly uniform across its range.

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

#lets get x and y
x <- median(df$X)
cat('small_x: ',x)
## small_x:  10.0819
y <- quantile(df$Y, 0.25)
cat('small_y: ',y)
## small_y:  3.356501

a. P(X>x | X>y)

P(A|B) is the probability that A occurs given that B has already occurred: \(P(A \mid B) = P(A \cap B)/P(B)\).

# a. P(X>x | X>y)   
p1 = df %>% filter(X > x, X > y) %>% nrow()/n

p2 = df %>% filter(X > y) %>% nrow()/n

prob_a = round(p1/p2,4)
cat('The probability of (a): ',prob_a)
## The probability of (a):  0.5742
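
Since x (the median of X) is larger than y (the first quartile of Y), the event X > x is contained in X > y, so this conditional probability simplifies to P(X > x)/P(X > y). As a quick check (a sketch reusing df, x, y, and n from above), computing it that way should give the same answer:

# Because x > y, P(X > x | X > y) = P(X > x) / P(X > y)
round((sum(df$X > x) / n) / (sum(df$X > y) / n), 4)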

b. P(X>x, Y>y)

This is the joint probability that X is greater than x and Y is greater than y.

prob_b = df %>% filter(X > x, Y > y) %>% nrow()/n

cat('The probability of (b): ',prob_b)
## The probability of (b):  0.3808

c. P(X<x | X>y)

p3 = df %>% filter(X < x, X > y) %>% nrow()/n

p4 = df %>% filter(X > y) %>% nrow()/n

prob_c = round(p3 / p4,4)
cat('The probability of (c): ',prob_c)
## The probability of (c):  0.4258
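
As a sanity check, given X > y, X must be either above or below x (equality has probability zero for a continuous variable), so the conditional probabilities in (a) and (c) should sum to 1:

# The conditional probabilities from (a) and (c) partition the event X > y
prob_a + prob_c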

Investigate

whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Joint Probability
prob = df %>% 
  mutate(A = ifelse(X > x, " X > x", " X < x"), B = ifelse(Y > y, " Y > y", " Y < y")) %>% 
  group_by(A, B) %>%
  dplyr::summarise(count = n()) %>%
  mutate(probability = count/n)

# Marginal - Probability
prob = prob %>% 
  ungroup() %>% 
  group_by(A) %>% 
  summarize(count = sum(count), probability = sum(probability)) %>% 
  mutate(B = "Total") %>% 
  bind_rows(prob)

prob = prob %>% 
  ungroup() %>% 
  group_by(B) %>% 
  summarize(count = sum(count), probability = sum(probability)) %>% 
  mutate(A = "Total") %>% bind_rows(prob)

# Creating table
prob %>% select(-count) %>% 
  spread(A, probability) %>% 
  dplyr::rename(` ` = B) %>%
  kable() %>% 
  kable_styling()
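
From the table we can check directly whether the joint probability equals the product of the marginals (a quick sketch reusing df, x, y, and n from above). By construction P(X > x) is about 0.5 and P(Y > y) is 0.75, so the product should be about 0.375, close to the joint probability of 0.3808 found above:

# Compare the joint probability with the product of the marginal probabilities
p_joint   <- sum(df$X > x & df$Y > y) / n
p_product <- (sum(df$X > x) / n) * (sum(df$Y > y) / n)
round(c(joint = p_joint, product = p_product), 4)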

Test

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# Joint Probability
prob = df %>% 
  mutate(A = ifelse(X > x, " X > x", " X < x"), B = ifelse(Y > y, " Y > y", " Y < y")) %>% 
  group_by(A, B) %>%
  dplyr::summarise(count = n()) %>%
  mutate(probability = count/n)

# Marginal - Probability
prob = prob %>% 
  ungroup() %>% 
  group_by(A) %>% 
  summarize(count = sum(count), probability = sum(probability)) %>% 
  mutate(B = "Total") %>% 
  bind_rows(prob)

prob = prob %>% 
  ungroup() %>% 
  group_by(B) %>% 
  summarize(count = sum(count), probability = sum(probability)) %>% 
  mutate(A = "Total") %>% bind_rows(prob)


count_data = prob %>% filter(A != "Total", B != "Total") %>% select(-probability) %>% 
    spread(A, count) %>% as.data.frame()

row.names(count_data) = count_data$B

count_data = count_data %>% 
  select(-B) %>% 
  as.matrix()

# Using Fisher's Exact Test
fisher.test(count_data)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  count_data
## p-value = 0.007904
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.032680 1.240419
## sample estimates:
## odds ratio 
##   1.131777
# Using Chi Square Test
chisq.test(count_data)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  count_data
## X-squared = 7.0533, df = 1, p-value = 0.007912

The p-value from Fisher's exact test is 0.007904 and the p-value from the Chi Square test is 0.007912, so the two tests agree closely. In Fisher's test the 95% confidence interval for the odds ratio (1.033, 1.240) does not contain 1, and both p-values are below the 0.05 significance level, so both tests reject the null hypothesis that the events X > x and Y > y are independent. The difference between the two is that Fisher's exact test computes an exact p-value and is typically used for small samples or small expected cell counts, while the Chi Square test relies on a large-sample approximation. Since the sample size here is large (n = 10,000), I think the Chi Square test is the more appropriate choice.
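
One way to see why the Chi Square approximation is safe here is to inspect the expected cell counts under independence (a quick check using the count_data matrix built above); the approximation is generally considered reliable when all expected counts are at least 5, and here they are far above that threshold:

# Expected cell counts under the null hypothesis of independence
chisq.test(count_data)$expected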

Problem 2

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques.

traindata <- read.csv('train.csv')

testdata <- read.csv('test.csv')

# Helper to replace NA's in selected numeric columns with 0
data = function(df) {
    df %>%
        mutate(BedroomAbvGr  = replace_na(BedroomAbvGr, 0),
               BsmtFullBath  = replace_na(BsmtFullBath, 0),
               BsmtHalfBath  = replace_na(BsmtHalfBath, 0),
               BsmtUnfSF     = replace_na(BsmtUnfSF, 0),
               EnclosedPorch = replace_na(EnclosedPorch, 0),
               Fireplaces    = replace_na(Fireplaces, 0),
               GarageArea    = replace_na(GarageArea, 0),
               GarageCars    = replace_na(GarageCars, 0),
               HalfBath      = replace_na(HalfBath, 0),
               KitchenAbvGr  = replace_na(KitchenAbvGr, 0),
               LotFrontage   = replace_na(LotFrontage, 0),
               OpenPorchSF   = replace_na(OpenPorchSF, 0),
               PoolArea      = replace_na(PoolArea, 0),
               ScreenPorch   = replace_na(ScreenPorch, 0),
               TotRmsAbvGrd  = replace_na(TotRmsAbvGrd, 0),
               WoodDeckSF    = replace_na(WoodDeckSF, 0))
}

traindata = data(traindata)
traindata %>%
  select_if(is.numeric) %>% # filter numeric var
  summary()
##        Id           MSSubClass     LotFrontage        LotArea      
##  Min.   :   1.0   Min.   : 20.0   Min.   :  0.00   Min.   :  1300  
##  1st Qu.: 365.8   1st Qu.: 20.0   1st Qu.: 42.00   1st Qu.:  7554  
##  Median : 730.5   Median : 50.0   Median : 63.00   Median :  9478  
##  Mean   : 730.5   Mean   : 56.9   Mean   : 57.62   Mean   : 10517  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   3rd Qu.: 79.00   3rd Qu.: 11602  
##  Max.   :1460.0   Max.   :190.0   Max.   :313.00   Max.   :215245  
##                                                                    
##   OverallQual      OverallCond      YearBuilt     YearRemodAdd 
##  Min.   : 1.000   Min.   :1.000   Min.   :1872   Min.   :1950  
##  1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967  
##  Median : 6.000   Median :5.000   Median :1973   Median :1994  
##  Mean   : 6.099   Mean   :5.575   Mean   :1971   Mean   :1985  
##  3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004  
##  Max.   :10.000   Max.   :9.000   Max.   :2010   Max.   :2010  
##                                                                
##    MasVnrArea       BsmtFinSF1       BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   Min.   :   0.0   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 223.0  
##  Median :   0.0   Median : 383.5   Median :   0.00   Median : 477.5  
##  Mean   : 103.7   Mean   : 443.6   Mean   :  46.55   Mean   : 567.2  
##  3rd Qu.: 166.0   3rd Qu.: 712.2   3rd Qu.:   0.00   3rd Qu.: 808.0  
##  Max.   :1600.0   Max.   :5644.0   Max.   :1474.00   Max.   :2336.0  
##  NA's   :8                                                           
##   TotalBsmtSF       X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  Min.   :   0.0   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  1st Qu.: 795.8   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Median : 991.5   Median :1087   Median :   0   Median :  0.000  
##  Mean   :1057.4   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  3rd Qu.:1298.2   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  Max.   :6110.0   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                                  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr    TotRmsAbvGrd   
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 5.000  
##  Median :0.0000   Median :3.000   Median :1.000   Median : 6.000  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   Mean   : 6.518  
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.: 7.000  
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000   Max.   :14.000  
##                                                                   
##    Fireplaces     GarageYrBlt     GarageCars      GarageArea    
##  Min.   :0.000   Min.   :1900   Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:1961   1st Qu.:1.000   1st Qu.: 334.5  
##  Median :1.000   Median :1980   Median :2.000   Median : 480.0  
##  Mean   :0.613   Mean   :1979   Mean   :1.767   Mean   : 473.0  
##  3rd Qu.:1.000   3rd Qu.:2002   3rd Qu.:2.000   3rd Qu.: 576.0  
##  Max.   :3.000   Max.   :2010   Max.   :4.000   Max.   :1418.0  
##                  NA's   :81                                     
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      X3SsnPorch    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##                                                                     
##   ScreenPorch        PoolArea          MiscVal             MoSold      
##  Min.   :  0.00   Min.   :  0.000   Min.   :    0.00   Min.   : 1.000  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.00   1st Qu.: 5.000  
##  Median :  0.00   Median :  0.000   Median :    0.00   Median : 6.000  
##  Mean   : 15.06   Mean   :  2.759   Mean   :   43.49   Mean   : 6.322  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.00   3rd Qu.: 8.000  
##  Max.   :480.00   Max.   :738.000   Max.   :15500.00   Max.   :12.000  
##                                                                        
##      YrSold       SalePrice     
##  Min.   :2006   Min.   : 34900  
##  1st Qu.:2007   1st Qu.:129975  
##  Median :2008   Median :163000  
##  Mean   :2008   Mean   :180921  
##  3rd Qu.:2009   3rd Qu.:214000  
##  Max.   :2010   Max.   :755000  
## 

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

I selected LotArea, GrLivArea, and GarageArea as independent variables and SalePrice as the dependent variable.

# LotArea
summary(traindata$LotArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245
hist(traindata$LotArea, xlab = "Lot Area", main = "Lot Area", col = "red")

# LotArea with Sales price
ggplot(traindata, aes(LotArea, SalePrice))+
  geom_point(colour = 'skyblue', alpha = 0.3) +
  theme_minimal()

# GrLivArea
summary(traindata$GrLivArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
hist(traindata$GrLivArea, xlab = "GrLivArea", main =  "GrLivArea", col = "red")

# GrLivArea with Sales price
ggplot(traindata, aes(GrLivArea, SalePrice))+
  geom_point(colour = 'skyblue', alpha = 0.3) +
  theme_minimal()

# GarageArea
summary(traindata$GarageArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   334.5   480.0   473.0   576.0  1418.0
hist(traindata$GarageArea, xlab = "Garage Area", main = "Garage Area", col = "red")

# Garage Area with Sales price
ggplot(traindata, aes(GarageArea, SalePrice))+
  geom_point(colour = 'skyblue', alpha = 0.3) +
  theme_minimal()

Dependent variable SalePrice

summary(traindata$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
hist(traindata$SalePrice, xlab = "Sale Price", main = "Sale Price", col = "red")

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

Scatterplot matrix for LotArea, GrLivArea, GarageArea and SalePrice.

mat_train = traindata %>%
  dplyr::select(LotArea, GrLivArea, GarageArea, SalePrice) 

pairs(mat_train, pch = 19)

Derive a correlation matrix for any three quantitative variables in the dataset.

cor_metrix = mat_train  %>%
  cor() %>%
  as.matrix()
cor_metrix
##              LotArea GrLivArea GarageArea SalePrice
## LotArea    1.0000000 0.2631162  0.1804028 0.2638434
## GrLivArea  0.2631162 1.0000000  0.4689975 0.7086245
## GarageArea 0.1804028 0.4689975  1.0000000 0.6234314
## SalePrice  0.2638434 0.7086245  0.6234314 1.0000000
cor_metrix %>% corrplot(method = 'number')

From the correlation plot we can see that GrLivArea has the highest correlation with SalePrice (0.71), while LotArea has the lowest (0.26).

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

cor.test(traindata$LotArea, traindata$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  traindata$LotArea and traindata$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434
cor.test(traindata$GrLivArea, traindata$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  traindata$GrLivArea and traindata$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
cor.test(traindata$GarageArea, traindata$SalePrice, method = 'pearson', conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  traindata$GarageArea and traindata$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314

The correlation between LotArea and SalePrice is 0.2638, between GrLivArea and SalePrice it is 0.7086, and between GarageArea and SalePrice it is 0.6234. In every test the p-value is < 2.2e-16, far below the 0.05 significance level, so we reject the null hypothesis that each pairwise correlation is 0: all three independent variables are significantly correlated with the dependent variable.

Because there are so many variables in the training dataset, yes, I would be worried about familywise error for the pairwise correlations I tested: each additional hypothesis test increases the chance of rejecting a true null hypothesis, so the individual significance levels should be adjusted for the number of comparisons being made.
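
As a sketch of how the familywise error rate could be controlled, the three pairwise p-values can be adjusted with a Bonferroni correction (here they are so small that the adjustment would not change the conclusion):

# Bonferroni-adjusted p-values for the three pairwise correlation tests
pvals <- c(LotArea    = cor.test(traindata$LotArea,    traindata$SalePrice)$p.value,
           GrLivArea  = cor.test(traindata$GrLivArea,  traindata$SalePrice)$p.value,
           GarageArea = cor.test(traindata$GarageArea, traindata$SalePrice)$p.value)
p.adjust(pvals, method = "bonferroni")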

Linear Algebra and Correlation.

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Correlation matrix

# Correlation matrix

cor_metrix = mat_train  %>%
  cor() %>%
  round(3)

cor_metrix
##            LotArea GrLivArea GarageArea SalePrice
## LotArea      1.000     0.263      0.180     0.264
## GrLivArea    0.263     1.000      0.469     0.709
## GarageArea   0.180     0.469      1.000     0.623
## SalePrice    0.264     0.709      0.623     1.000

Precision matrix

# Precision matrix

precision_matrix = solve(cor_metrix) %>%
  round(3)
  

precision_matrix
##            LotArea GrLivArea GarageArea SalePrice
## LotArea      1.089    -0.165     -0.020    -0.158
## GrLivArea   -0.165     2.041     -0.087    -1.349
## GarageArea  -0.020    -0.087      1.639    -0.954
## SalePrice   -0.158    -1.349     -0.954     2.593
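
As noted in the problem statement, the diagonal of the precision matrix contains the variance inflation factors, so we can read them off directly (a quick sketch using the precision_matrix computed above):

# Variance inflation factors sit on the diagonal of the precision matrix
diag(precision_matrix)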

When we multiply the correlation matrix by the precision matrix we get (approximately) the identity matrix; the 1.001 in the last diagonal entry is just rounding error from rounding both matrices to three decimals.

cor_metrix %*% precision_matrix %>%
  round(3)
##            LotArea GrLivArea GarageArea SalePrice
## LotArea          1         0          0     0.000
## GrLivArea        0         1          0     0.000
## GarageArea       0         0          1     0.000
## SalePrice        0         0          0     1.001

And likewise, multiplying the precision matrix by the correlation matrix gives the identity matrix.

precision_matrix %*% cor_metrix %>%
  round(3)
##            LotArea GrLivArea GarageArea SalePrice
## LotArea          1         0          0     0.000
## GrLivArea        0         1          0     0.000
## GarageArea       0         0          1     0.000
## SalePrice        0         0          0     1.001

Conduct LU decomposition on the matrix.

decomposition = lu.decomposition(cor_metrix)
decomposition
## $L
##       [,1]      [,2]      [,3] [,4]
## [1,] 1.000 0.0000000 0.0000000    0
## [2,] 0.263 1.0000000 0.0000000    0
## [3,] 0.180 0.4529931 1.0000000    0
## [4,] 0.264 0.6870936 0.3679674    1
## 
## $U
##      [,1]     [,2]      [,3]      [,4]
## [1,]    1 0.263000 0.1800000 0.2640000
## [2,]    0 0.930831 0.4216600 0.6395680
## [3,]    0 0.000000 0.7765909 0.2857601
## [4,]    0 0.000000 0.0000000 0.3857105
# Correlation matrix (for comparison)
cor_metrix
##            LotArea GrLivArea GarageArea SalePrice
## LotArea      1.000     0.263      0.180     0.264
## GrLivArea    0.263     1.000      0.469     0.709
## GarageArea   0.180     0.469      1.000     0.623
## SalePrice    0.264     0.709      0.623     1.000
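
As a check on the decomposition (a short sketch using the decomposition object above), multiplying L by U should reproduce the rounded correlation matrix:

# L %*% U should recover the (rounded) correlation matrix
round(decomposition$L %*% decomposition$U, 3)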

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

I selected GrLivArea; it is right skewed, and since its minimum value (334) is already above zero, no shift is needed.

# checking the minimum value
summary(traindata$GrLivArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
par(mfrow=c(1,2))
hist(traindata$GrLivArea, main="Hist GrLivArea", col = "skyblue")
boxplot(traindata$GrLivArea, main="Boxplot GrLivArea", col = "skyblue")

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

fit_exp = fitdistr(traindata$GrLivArea, "exponential")
fit_exp
##        rate    
##   6.598640e-04 
##  (1.726943e-05)
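
For the exponential distribution the maximum-likelihood estimate of the rate is just the reciprocal of the sample mean, so we can sanity-check the fitdistr result directly:

# MLE of the exponential rate parameter is 1/mean
1 / mean(traindata$GrLivArea)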

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

# Find the optimal value of `λ` for this distribution
lambda = round(fit_exp$estimate, 8)

# Take 1000 samples from this exponential distribution using this value
set.seed(23)
sample_exp = rexp(1000, lambda)
head(sample_exp,20)
##  [1]  232.18155 2449.53103  968.23252 2062.59717 1032.10341  330.64852
##  [7] 1048.54754 2274.00115   54.92248  283.54750  615.97914 1200.42286
## [13] 2375.57976  913.55840 1877.34744  971.26874  560.53488  117.56671
## [19]  366.77755 1196.84939

Plot a histogram and compare it with a histogram of your original variable.

par(mfrow = c(1, 2))

hist(traindata$GrLivArea, breaks = 50, main = "The Original GrLivArea", col = "skyblue")

hist(sample_exp, breaks = 50, main = "Exponential GrLivArea", col = "red")

From the plots above we can see that the exponential sample is far more right skewed than the original GrLivArea, which is concentrated around its center rather than near zero.

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

# The 5th percentile
qexp(0.05, rate = lambda)
## [1] 77.7336
# The 95th percentile
qexp(0.95, rate = lambda)
## [1] 4539.951
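
These quantiles follow from inverting the exponential CDF \(F(x) = 1 - e^{-\lambda x}\): the pth percentile is \(-\ln(1 - p)/\lambda\). Computing that directly should match qexp (using the lambda estimated above):

# Closed-form exponential quantiles: -log(1 - p) / lambda
-log(1 - c(0.05, 0.95)) / lambda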

Also generate a 95% confidence interval from the empirical data, assuming normality.

CI(traindata$GrLivArea, 0.95)
##    upper     mean    lower 
## 1542.440 1515.464 1488.487
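
Assuming CI() here is Rmisc::CI, it computes a t-based interval, mean \(\pm\) \(t_{0.975}\, s/\sqrt{n}\); the same interval can be computed by hand as a check:

# 95% confidence interval for the mean, assuming approximate normality
g <- traindata$GrLivArea
mean(g) + c(-1, 1) * qt(0.975, df = length(g) - 1) * sd(g) / sqrt(length(g))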

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

quantile(traindata$GrLivArea, c(.05, .95))
##     5%    95% 
##  848.0 2466.1

With 95% confidence the mean of GrLivArea is between 1488.5 and 1542.4. Comparing the exponential percentiles (77.7 and 4540.0) with the empirical ones (848 and 2466.1), and looking at the plots above, the exponential distribution does not appear to be a good fit: it places far too much mass near zero and in the long right tail relative to the empirical data.

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

# Build multiple regression model for numeric variables

m1 <- lm(SalePrice ~  OverallQual+YearBuilt+YearRemodAdd+MasVnrArea+BsmtFinSF1+TotalBsmtSF+X1stFlrSF+X2ndFlrSF+GrLivArea+FullBath+TotRmsAbvGrd+Fireplaces+GarageCars+GarageArea+WoodDeckSF+OpenPorchSF ,data = traindata)

summary(m1)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + YearRemodAdd + 
##     MasVnrArea + BsmtFinSF1 + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + 
##     GrLivArea + FullBath + TotRmsAbvGrd + Fireplaces + GarageCars + 
##     GarageArea + WoodDeckSF + OpenPorchSF, data = traindata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -512233  -17548   -1737   14681  283280 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.094e+06  1.268e+05  -8.627  < 2e-16 ***
## OverallQual   1.856e+04  1.174e+03  15.807  < 2e-16 ***
## YearBuilt     1.638e+02  4.978e+01   3.290 0.001028 ** 
## YearRemodAdd  3.564e+02  6.208e+01   5.741 1.15e-08 ***
## MasVnrArea    2.881e+01  6.159e+00   4.678 3.17e-06 ***
## BsmtFinSF1    1.725e+01  2.596e+00   6.646 4.26e-11 ***
## TotalBsmtSF   1.165e+01  4.298e+00   2.711 0.006796 ** 
## X1stFlrSF     2.618e+01  2.082e+01   1.257 0.208871    
## X2ndFlrSF     1.753e+01  2.048e+01   0.856 0.392000    
## GrLivArea     2.135e+01  2.035e+01   1.049 0.294370    
## FullBath     -1.489e+03  2.630e+03  -0.566 0.571228    
## TotRmsAbvGrd  1.688e+03  1.089e+03   1.550 0.121402    
## Fireplaces    7.888e+03  1.783e+03   4.423 1.05e-05 ***
## GarageCars    1.011e+04  2.960e+03   3.414 0.000659 ***
## GarageArea    1.040e+01  1.005e+01   1.035 0.301006    
## WoodDeckSF    3.068e+01  8.129e+00   3.774 0.000167 ***
## OpenPorchSF   7.271e+00  1.572e+01   0.462 0.643861    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36380 on 1435 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7918, Adjusted R-squared:  0.7894 
## F-statistic:   341 on 16 and 1435 DF,  p-value: < 2.2e-16

Based on the significance levels, let's eliminate the variables that are not significant.

m2 <- lm(SalePrice ~  OverallQual+YearRemodAdd+MasVnrArea+BsmtFinSF1+TotalBsmtSF+Fireplaces+GarageCars+WoodDeckSF ,data = traindata)

summary(m2)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + TotalBsmtSF + Fireplaces + GarageCars + WoodDeckSF, 
##     data = traindata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -407840  -21443   -2760   16410  363961 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.307e+05  1.210e+05  -6.867 9.70e-12 ***
## OverallQual   2.449e+04  1.183e+03  20.706  < 2e-16 ***
## YearRemodAdd  3.925e+02  6.256e+01   6.273 4.66e-10 ***
## MasVnrArea    4.651e+01  6.602e+00   7.045 2.85e-12 ***
## BsmtFinSF1    1.482e+01  2.752e+00   5.383 8.52e-08 ***
## TotalBsmtSF   2.504e+01  3.290e+00   7.611 4.89e-14 ***
## Fireplaces    1.551e+04  1.849e+03   8.389  < 2e-16 ***
## GarageCars    1.794e+04  1.820e+03   9.855  < 2e-16 ***
## WoodDeckSF    4.464e+01  8.848e+00   5.045 5.12e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39960 on 1443 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7474, Adjusted R-squared:  0.746 
## F-statistic: 533.6 on 8 and 1443 DF,  p-value: < 2.2e-16
#hist
hist(m2$residuals,breaks = 200, col = 'skyblue')

#QQ plot
qqnorm(m2$residuals)
qqline(m2$residuals)

From the histogram and Q-Q plot, the residuals look approximately normally distributed, with some outliers in the tails.

Now let's check the test data for complete cases; as the output shows, no row is fully complete (every row has at least one NA).

testdata[complete.cases(testdata),]
##  [1] Id            MSSubClass    MSZoning      LotFrontage   LotArea      
##  [6] Street        Alley         LotShape      LandContour   Utilities    
## [11] LotConfig     LandSlope     Neighborhood  Condition1    Condition2   
## [16] BldgType      HouseStyle    OverallQual   OverallCond   YearBuilt    
## [21] YearRemodAdd  RoofStyle     RoofMatl      Exterior1st   Exterior2nd  
## [26] MasVnrType    MasVnrArea    ExterQual     ExterCond     Foundation   
## [31] BsmtQual      BsmtCond      BsmtExposure  BsmtFinType1  BsmtFinSF1   
## [36] BsmtFinType2  BsmtFinSF2    BsmtUnfSF     TotalBsmtSF   Heating      
## [41] HeatingQC     CentralAir    Electrical    X1stFlrSF     X2ndFlrSF    
## [46] LowQualFinSF  GrLivArea     BsmtFullBath  BsmtHalfBath  FullBath     
## [51] HalfBath      BedroomAbvGr  KitchenAbvGr  KitchenQual   TotRmsAbvGrd 
## [56] Functional    Fireplaces    FireplaceQu   GarageType    GarageYrBlt  
## [61] GarageFinish  GarageCars    GarageArea    GarageQual    GarageCond   
## [66] PavedDrive    WoodDeckSF    OpenPorchSF   EnclosedPorch X3SsnPorch   
## [71] ScreenPorch   PoolArea      PoolQC        Fence         MiscFeature  
## [76] MiscVal       MoSold        YrSold        SaleType      SaleCondition
## <0 rows> (or 0-length row.names)
# predicting on the test data
pred <- predict(m1, testdata)

# Kaggle submission file
kaggle_score <- data.frame(Id = testdata[, "Id"], SalePrice = pred)
kaggle_score[kaggle_score < 0] <- 0
kaggle_score <- replace(kaggle_score, is.na(kaggle_score), 0)
write.csv(kaggle_score, file = "kaggle_score.csv", row.names = FALSE)

Report your Kaggle.com user name and score. Kaggle Score

Link

Link to Rpubs