Problem 1

Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below, and define that as Y.

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

Y1 = c(20.3,19.1,19.3,20.9,22.0,23.5,13.8,18.8,20.9,18.6,22.3,17.6,20.8,28.7,15.2,20.9,18.4,10.3,26.3,28.1)
X1 = c(9.3,4.1,22.4,9.1,15.8,7.1,15.9,6.9,16.0,6.7,8.2,16.0,6.4,11.8,3.5,21.7,12.2,9.3,8.0,6.2)

#Get summary statistics
summary(Y1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.30   18.55   20.55   20.29   22.07   28.70
summary(X1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.50    6.85    9.20   10.83   15.82   22.40
#Create table
table = data.frame(X1,Y1)


#Get counts of Y 1q (bigger) and X 3q (smaller or bigger)
Y_1q_b_cnt = nrow(subset(table, Y1 > 18.55))
X_3q_b_cnt = nrow(subset(table, X1 > 15.82))
X_3q_s_cnt = nrow(subset(table, X1 < 15.82))
total_cnt = nrow(table)

Y_1q_b = subset(table, Y1 > 18.55)

X_3q_b_given_Y_1q_b_cnt = nrow(subset(Y_1q_b, X1 > 15.82))
X_3q_s_given_Y_1q_b_cnt = nrow(subset(Y_1q_b, X1 < 15.82))

#Prob for X > 3q given Y > 1q
p_a = X_3q_b_given_Y_1q_b_cnt / Y_1q_b_cnt
p_a
## [1] 0.2
#Prob for X > 3q and Y > 1q
p_b = X_3q_b_given_Y_1q_b_cnt / total_cnt
p_b
## [1] 0.15
X_3q_b_AND_Y_1q_b_cnt = nrow(subset(table, X1 > 15.82 & Y1 > 18.55))


#Prob for X < 3q given y > 1q
p_c = X_3q_s_given_Y_1q_b_cnt / Y_1q_b_cnt
p_c
## [1] 0.8
#Note that P_c + P_a = 1 as P(X<x) and P(X>x) are opposite to one another given Y>y.

5 points. In addition, make a table of counts as shown below.

Y_3q_b = subset(table, Y1 > 18.55)
Y_3q_se = subset(table, Y1 <= 18.55)

X_1q_b_given_Y_3q_b_cnt = nrow(subset(Y_3q_b, X1 > 15.82))
X_1q_se_given_Y_3q_b_cnt = nrow(subset(Y_3q_b, X1 <= 15.82))

X_1q_b_given_Y_3q_se_cnt = nrow(subset(Y_3q_se, X1 > 15.82))
X_1q_se_given_Y_3q_se_cnt = nrow(subset(Y_3q_se, X1 <= 15.82))

#Creating matrix table
data2 = data.frame(X_1q_se_given_Y_3q_se_cnt,
                   X_1q_b_given_Y_3q_se_cnt,
                   X_1q_b_given_Y_3q_se_cnt + X_1q_se_given_Y_3q_se_cnt,
                   X_1q_se_given_Y_3q_b_cnt,
                   X_1q_b_given_Y_3q_b_cnt,
                   X_1q_b_given_Y_3q_b_cnt + X_1q_se_given_Y_3q_b_cnt,
                   X_1q_se_given_Y_3q_se_cnt + X_1q_se_given_Y_3q_b_cnt,
                   X_1q_b_given_Y_3q_se_cnt + X_1q_b_given_Y_3q_b_cnt,
                   total_cnt)
data2 = matrix(data2, nrow = 3, ncol = 3)
data2 = data.frame(data2)

rownames(data2) <- c('<=1st q for x', '>1st q for x', 'Total')
colnames(data2) <- c('<=3rd q for y', '>3rd q for y', 'Total')

data2
##               <=3rd q for y >3rd q for y Total
## <=1st q for x             3           12    15
## >1st q for x              2            3     5
## Total                     5           15    20

5 points. Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

X_1q_b_cnt = nrow(subset(table, X1 > 6.85))
Y_1q_b_cnt = nrow(subset(table, Y1 > 18.55))
X_1q_b_AND_Y_1q_b_cnt = nrow(subset(table, X1 > 6.85 & Y1 > 18.55))

P_AnB = X_1q_b_AND_Y_1q_b_cnt / total_cnt
P_A = (X_1q_b_cnt / total_cnt)
P_B = (Y_1q_b_cnt / total_cnt)
P_ATimesP_B = P_A * P_B

P_AnB
## [1] 0.55
P_ATimesP_B
## [1] 0.5625
#P(AB) is not equal to P(A)P(B). A and B are not independent.

# Chi-sq test
X_1q_b = subset(table, X1 > 6.85)
Y_1q_b = subset(table, Y1 > 18.55)

chisq.test(X_1q_b$X1, Y_1q_b$Y1)
## Warning in chisq.test(X_1q_b$X1, Y_1q_b$Y1): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  X_1q_b$X1 and Y_1q_b$Y1
## X-squared = 155, df = 144, p-value = 0.251
#chi square test shows that there is no relationship between A and B as p-value > 0.05.
#We do not reject null hypothesis which is independence between A and B.

Problem 2 You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. I want you to do the following.

5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

data = read.csv("https://raw.githubusercontent.com/wheremagichappens/an.dy/master/data605/train.csv")
indp = colnames(data[1:(length(colnames(data))-1)])
dep = colnames(data[81])

#Scatter Plot Matrics
library(psych)
pairs.panels(data[,c('SalePrice','TotalBsmtSF','GarageArea')], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = TRUE,  # show density plots
             ellipses = TRUE # show correlation ellipses
)

#CI 80% for each variable
cor.test(data$TotalBsmtSF, data$SalePrice, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  data$TotalBsmtSF and data$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922142 0.6340846
## sample estimates:
##       cor 
## 0.6135806
#CI 80% = [0.59, 0.63]
cor.test(data$GarageArea, data$SalePrice, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  data$GarageArea and data$SalePrice
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
#CI 80% = [0.60, 0.64]
cor.test(data$GarageArea, data$TotalBsmtSF, method = c("pearson", "kendall", "spearman"), conf.level = 0.8)
## 
##  Pearson's product-moment correlation
## 
## data:  data$GarageArea and data$TotalBsmtSF
## t = 21.272, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4606274 0.5118666
## sample estimates:
##       cor 
## 0.4866655
#CI 80% = [0.46, 0.51]

#P-value for each of case above is less than 0.05, we reject null hypothesis and accept alternative hypothesis that true correlation is not equal to 0 for each variable.

#familywise error rate = 1 - (1-alpha) = 1 - 0.8 = 0.2. 
#This means that the probability of a type I error is 20%, which is pretty high considering only 1 test was performed. I would worry about it a little bit.

5 points. Linear Algebra and Correlation. Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

#Inverting matrix and multiplying
library(matlib)
## 
## Attaching package: 'matlib'
## The following object is masked from 'package:psych':
## 
##     tr
A <- cor(data[, c('TotalBsmtSF','GarageArea', 'SalePrice')], method = c("pearson", "kendall", "spearman"))
B <- inv(A)

AB = A * B
BA = B * A
I = diag(nrow(A))
AB
##             TotalBsmtSF GarageArea  SalePrice
## TotalBsmtSF   1.6507677 -0.1368536 -0.5139141
## GarageArea   -0.1368536  1.6836724 -0.5468188
## SalePrice    -0.5139141 -0.5468188  2.0607329
BA
##                                      
## [1,]  1.6507677 -0.1368536 -0.5139141
## [2,] -0.1368536  1.6836724 -0.5468188
## [3,] -0.5139141 -0.5468188  2.0607329
I
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1
#Note that AB = BA but not equal to I.

#LU decomposition
library(Matrix)
mm <- Matrix(A)
str(lum <- lu(mm))
## Formal class 'denseLU' [package "Matrix"] with 4 slots
##   ..@ x       : num [1:9] 1 0.487 0.614 0.487 0.763 ...
##   ..@ perm    : int [1:3] 1 2 3
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:3] "TotalBsmtSF" "GarageArea" "SalePrice"
##   .. ..$ : chr [1:3] "TotalBsmtSF" "GarageArea" "SalePrice"
##   ..@ Dim     : int [1:2] 3 3
elu <- expand(lum)
elu # three components: "L", "U", and "P", the permutation
## $L
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000         .         .
## [2,] 0.4866655 1.0000000         .
## [3,] 0.6135806 0.4256308 1.0000000
## 
## $U
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]      [,2]      [,3]     
## [1,] 1.0000000 0.4866655 0.6135806
## [2,]         . 0.7631567 0.3248230
## [3,]         .         . 0.4852643
## 
## $P
## 3 x 3 sparse Matrix of class "pMatrix"
##           
## [1,] | . .
## [2,] . | .
## [3,] . . |
elu$L %*% elu$U
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.4866655 0.6135806
## [2,] 0.4866655 1.0000000 0.6234314
## [3,] 0.6135806 0.6234314 1.0000000
(m2 <- with(elu, P %*% L %*% U)) # the same as 'mm'
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.4866655 0.6135806
## [2,] 0.4866655 1.0000000 0.6234314
## [3,] 0.6135806 0.6234314 1.0000000

5 points. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

#From summary of data, we pick any variable that has median < mean.
summary(data)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   RH     :  16   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RL     :1151   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RM     : 218   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                  Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street      Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl:  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   Pave:  41   IR2: 41   HLS:  50   
##  Median :  9478               NA's:1369   IR3: 10   Low:  36   
##  Mean   : 10517                           Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                
##  Max.   :215245                                                
##                                                                
##   Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub:1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  NoSeWa:   1   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##                FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                FR3    :   4              Edwards:100   RRAn   :  26  
##                Inside :1052              Somerst: 86   PosN   :  19  
##                                          Gilbert: 79   RRAe   :  11  
##                                          (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual    
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000  
##  (Other):   2                 (Other): 19                   
##   OverallCond      YearBuilt     YearRemodAdd    RoofStyle   
##  Min.   :1.000   Min.   :1872   Min.   :1950   Flat   :  13  
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Gable  :1141  
##  Median :5.000   Median :1973   Median :1994   Gambrel:  11  
##  Mean   :5.575   Mean   :1971   Mean   :1985   Hip    : 286  
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004   Mansard:   7  
##  Max.   :9.000   Max.   :2010   Max.   :2010   Shed   :   2  
##                                                              
##     RoofMatl     Exterior1st   Exterior2nd    MasVnrType    MasVnrArea    
##  CompShg:1434   VinylSd:515   VinylSd:504   BrkCmn : 15   Min.   :   0.0  
##  Tar&Grv:  11   HdBoard:222   MetalSd:214   BrkFace:445   1st Qu.:   0.0  
##  WdShngl:   6   MetalSd:220   HdBoard:207   None   :864   Median :   0.0  
##  WdShake:   5   Wd Sdng:206   Wd Sdng:197   Stone  :128   Mean   : 103.7  
##  ClyTile:   1   Plywood:108   Plywood:142   NA's   :  8   3rd Qu.: 166.0  
##  Membran:   1   CemntBd: 61   CmentBd: 60                 Max.   :1600.0  
##  (Other):   2   (Other):128   (Other):136                 NA's   :8       
##  ExterQual ExterCond  Foundation  BsmtQual   BsmtCond    BsmtExposure
##  Ex: 52    Ex:   3   BrkTil:146   Ex  :121   Fa  :  45   Av  :221    
##  Fa: 14    Fa:  28   CBlock:634   Fa  : 35   Gd  :  65   Gd  :134    
##  Gd:488    Gd: 146   PConc :647   Gd  :618   Po  :   2   Mn  :114    
##  TA:906    Po:   1   Slab  : 24   TA  :649   TA  :1311   No  :953    
##            TA:1282   Stone :  6   NA's: 37   NA's:  37   NA's: 38    
##                      Wood  :  3                                      
##                                                                      
##  BsmtFinType1   BsmtFinSF1     BsmtFinType2   BsmtFinSF2     
##  ALQ :220     Min.   :   0.0   ALQ :  19    Min.   :   0.00  
##  BLQ :148     1st Qu.:   0.0   BLQ :  33    1st Qu.:   0.00  
##  GLQ :418     Median : 383.5   GLQ :  14    Median :   0.00  
##  LwQ : 74     Mean   : 443.6   LwQ :  46    Mean   :  46.55  
##  Rec :133     3rd Qu.: 712.2   Rec :  54    3rd Qu.:   0.00  
##  Unf :430     Max.   :5644.0   Unf :1256    Max.   :1474.00  
##  NA's: 37                      NA's:  38                     
##    BsmtUnfSF       TotalBsmtSF      Heating     HeatingQC CentralAir
##  Min.   :   0.0   Min.   :   0.0   Floor:   1   Ex:741    N:  95    
##  1st Qu.: 223.0   1st Qu.: 795.8   GasA :1428   Fa: 49    Y:1365    
##  Median : 477.5   Median : 991.5   GasW :  18   Gd:241              
##  Mean   : 567.2   Mean   :1057.4   Grav :   7   Po:  1              
##  3rd Qu.: 808.0   3rd Qu.:1298.2   OthW :   2   TA:428              
##  Max.   :2336.0   Max.   :6110.0   Wall :   4                       
##                                                                     
##  Electrical     X1stFlrSF      X2ndFlrSF     LowQualFinSF    
##  FuseA:  94   Min.   : 334   Min.   :   0   Min.   :  0.000  
##  FuseF:  27   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  FuseP:   3   Median :1087   Median :   0   Median :  0.000  
##  Mix  :   1   Mean   :1163   Mean   : 347   Mean   :  5.845  
##  SBrkr:1334   3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##  NA's :   1   Max.   :4692   Max.   :2065   Max.   :572.000  
##                                                              
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##                                                                   
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex:100     
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa: 39     
##  Median :0.0000   Median :3.000   Median :1.000   Gd:586     
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047   TA:735     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000              
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000              
##                                                              
##   TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu   GarageType 
##  Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24    2Types :  6  
##  1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33    Attchd :870  
##  Median : 6.000   Min1:  31   Median :1.000   Gd  :380    Basment: 19  
##  Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20    BuiltIn: 88  
##  3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313    CarPort:  9  
##  Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690    Detchd :387  
##                   Typ :1360                               NA's   : 81  
##   GarageYrBlt   GarageFinish   GarageCars      GarageArea     GarageQual 
##  Min.   :1900   Fin :352     Min.   :0.000   Min.   :   0.0   Ex  :   3  
##  1st Qu.:1961   RFn :422     1st Qu.:1.000   1st Qu.: 334.5   Fa  :  48  
##  Median :1980   Unf :605     Median :2.000   Median : 480.0   Gd  :  14  
##  Mean   :1979   NA's: 81     Mean   :1.767   Mean   : 473.0   Po  :   3  
##  3rd Qu.:2002                3rd Qu.:2.000   3rd Qu.: 576.0   TA  :1311  
##  Max.   :2010                Max.   :4.000   Max.   :1418.0   NA's:  81  
##  NA's   :81                                                              
##  GarageCond  PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex  :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa  :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd  :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  Po  :   7              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  TA  :1326              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  NA's:  81              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                           
##    X3SsnPorch      ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex  :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd  :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   NA's:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.00   Max.   :480.00   Max.   :738.000              
##                                                                 
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv:  59   Gar2:   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo :  54   Othr:   2   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv: 157   Shed:  49   Median :    0.00   Median : 6.000  
##  MnWw :  11   TenC:   1   Mean   :   43.49   Mean   : 6.322  
##  NA's :1179   NA's:1406   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :15500.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
#BsmtUnfSF has median < mean. Let's choose this.
summary(data$BsmtUnfSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   223.0   477.5   567.2   808.0  2336.0
#To me, BsmtUnfSF = 0 does not make sense. Let's filter these out
library(MASS)
library(rcompanion)
Turbidity = data$BsmtUnfSF
Turbidity = Turbidity[Turbidity>0]
Turbidity
##    [1]  150  284  434  540  490   64  317  216  952  140  134  177  175
##   [14] 1494  520  832  426  468  525 1158  637 1777  200  204 1566  180
##   [27]  486  207  520  649 1228 1234  380  408 1117 1097   84  326  445
##   [40]  383  167  465 1296   83 1632  736  192  612  816   32  935  321
##   [53]  860 1410  148  217  530 1346  576  318 1143 1035  440  747  701
##   [66]  343  280  832  404  840  724  295 1768  440  448   36 1530 1065
##   [79]  384 1288  684  612 1013  402  635  163  168  176  370  426  440
##   [92]  350  381  410  741 1226  816 1053  641  516  793 1139  550  134
##  [105]  280  905  104  310  252 1125  203  728  732  510  899 1362   30
##  [118]  958  556  148  413  479  297  658  262  891 1304  519 1907  336
##  [131]  107  432  434  403  811  396  970  506  884  400  896  253  310
##  [144]  409   93 1200  572  774  769 1335  572  556  340  882  779  112
##  [157]  470  294  840 1686  360  441  354  700  725  320  554  312  968
##  [170]  320  441 1362  504 1107  577  660  440  556   99  871  970  474
##  [183]  289  600  140  755  625 1121  276  186  408 1424 1140  375   92
##  [196]  305  396 1176   78  274  311  710  490  686  457 1232 1498 1010
##  [209]  163  160 2336  630  638  162   70 1357 1194  773  483  235  125
##  [222] 1390  594 1694  641  488  357  540  626  253  916 1020 1367  840
##  [235]  747  728  798  452  392  975  361  270  602 1482  680  606   88
##  [248]  342  212  392 1095   96  628  270  952 1560  744 2121  768  386
##  [261]  357  410 1468 1145  625  312  244  432  698 1079  570  476  131
##  [274]  184  490  326  143 1092  324  747 1541 1470  536  319  599  622
##  [287]  179  292  286   80  712  291  153 1088  360  336 1249  166  906
##  [300]  710  604  100  818  844  596 1424  210 1603  638  115  103  673
##  [313]  726  995  630  967  721 1656  175  972  460  208  191  438 1869
##  [326]  371  624  552  322  598  268  468  130  115  484  321   84  216
##  [339]  785  728  728  733  953  847  333  572 1580  411  982  808 1293
##  [352]  939  784  595 1232  658  410 1468  402  229  114  522  735  405
##  [365]  117  324  961  280  474 1286  672 1141  806  165 1064  840 1063
##  [378]  245 1276  892 1008  499 1316  463  242  444  281   35  356  988
##  [391]  484  580  651  619  544  387   96  901  294  926  135   70  648
##  [404]  884   75  684  788 1307 1078 1258  273 1436  557  930  780  318
##  [417]  813  878  130  768  122  326  624  248  588  524  288  389  424
##  [430] 1375  499  342 1626  406  808   88  626  298  340  484 2153  417
##  [443]  739  572  225  611  319  411  506  237  486  290  115  264  238
##  [456]  728  363  190  225 1969  697  414  522  316  466  420  254  103
##  [469]  960  397 1191  548   50  178 1368  169  748  768  570  689 1264
##  [482]   88 1276  467  605  878  660 1257  551  122  180  816  678  611
##  [495]  707  148  880  264  378  223  578  969  379  100  765  149  912
##  [508]  510  620 1709  132  993  197  125 1374   90  195  706 1163  367
##  [521]  806 1122 1515   55 1497  450  846  384   23  390  861  285  689
##  [534] 1050  331 2042 1237  884  408  168  113  742  280  384  163  924
##  [547]  392  684 1258   23  512  780  119  600  572  314  308  293  537
##  [560]  126  536  427  309  914  173  326  832 1774  611  823  384  420
##  [573]  336  485 1116  978  350  390  288  636 1530  564  108 1184  264
##  [586]  811  796  366  300  319  542  645  664  594  756  813  755  880
##  [599]  756  413  525  247  776  912  849  793   88 1392   38  356 1406
##  [612]  111  270  200  700  545  121  441  291  244  544 1095  768 2046
##  [625]  161  261  611  288  567 1195  180  874  312  474 1342  151  989
##  [638]  245 1073  927   92  219  224 1375  526 1164 1234  360  761  424
##  [651]  461  728  876  270  859  461  171  192 1064  718   92  138  448
##  [664]  594  186  673  941  464  250   72  508 1584  628  415   82  901
##  [677]  270  948  490  893  864  264   80 1349   76  604  487  652 1240
##  [690]  801  576  660  279 1030  115  348  846  234  195 1198  252  740
##  [703]  732   89 1498  586  323 1836  234  173  480  456 1935  338 1594
##  [716]  102 1237  374 1413  742  491  300  901  264 1129  490  255 1496
##  [729]  712  650  660  203 1926  162  154  999   80 1734  124 1417  100
##  [742]   15  380  849  186  540  834  686 1649  522  350  506  625  798
##  [755]  936  847  778 1489  442 1434  600  352  600  458  975  572  625
##  [768] 1221  153 1099  416  516  650  276 1800  876  227  404  907  528
##  [781]  189 1273  554  563  372  381  702 1090  435  912  198  702 1372
##  [794]  174 1638  108  300  894  299  105  457  676 1120  431  292  210
##  [807]  218  110  254  808  795  460  460 1098  816 1043  481  672  192
##  [820]  396  319  380  666  142  447  536  132  783 1670  277  412 1560
##  [833]  794  239  742  662 1072  279  717  318  546  430   75  422  245
##  [846]  114  188 1288  598  266 1181  280 1753  964 1450  121 1905 1480
##  [859]  160  600  343  772  927 1032  396  220 1632  354  102  316  936
##  [872]  179  317  187  108   29  495  276  640  638   92  162  434  248
##  [885] 1800  193  783  300  196  600   75  720  197  918 1428  728   32
##  [898]  440  135  342  470   77  371 1266 1128  124  485  284  692  770
##  [911]  322  700  169  750  528  363  135 1442 1007  501  691 1550 1680
##  [924] 1330  390  420 1710 1008  720  602  310  746  167  814  184  384
##  [937] 1346  108  515  588 1330  276  571  125  359  355  686  301  326
##  [950]  668  920  598 1055  546  121  284  336  406  390 1420 1752  304
##  [963] 1302 1316  374  833  133  549  705  378  168  722  894  662  706
##  [976]  184  105  799  105  625  462  429  810  155 1240  390  170  230
##  [989]  186  495  602  216 1459  698   99  189  212 1082  970  208   90
## [1002]  758  203  448 1290  684 1074  567  251  422  630  431  172  868
## [1015]  924  797  554  400  108  365  418  730   55  192  533  671 1012
## [1028] 1528  672  698  384 1005 1373  230  847  500  762 1008  544  916
## [1041]  960  752  780  270  100  399  316  718  324 1042   40  429  572
## [1054]   26  932 1290  230  278  410  280  586  410 1580  459  544  568
## [1067]  402  289 1502 1694  173  501  543  574  599  625  977  449  983
## [1080]  218  350  731  120  300  299  413  392  538  168  831  994   90
## [1093]  776  702  341  292  728  879  815 1212  504  864  866  884 1630
## [1106]  268 1496 1342  319  440 1055  132  328  141  340  364  672  210
## [1119]  844 1380   64   81  894  317  162  409  652  303  188  940  747
## [1132]  764  847 1141 1048  334 1689  690  792  585  756  473  416  246
## [1145] 1045 1405  354   36  746  459  672  864  201   14  841  546 1104
## [1158]  764 1139  241  925 2002  536  414   74 1489  300  661  708  130
## [1171] 1152  324  698  785  256  364  912  804  780  678  812  343 1085
## [1184]  138  399  994  638  697   36  344  466  284  224  425 1616  976
## [1197]   80 1368  244  172   78  496  349  971 1393  216  176 1622 1352
## [1210] 1753  372  628   76  140 1795  796  175 1017  935 1588  428  126
## [1223]  803  664 1656  693  216  504  858  300 1284  896  728  710 1203
## [1236] 1652   39  425  748  539  698 1217  257  570  524  344  378  533
## [1249]  612  256  715  616  600  281  240   36  163  315  420  161  133
## [1262] 1351 1026 1571  384  156  174  384  661  340  596  816  356   61
## [1275]  133  426  360  125 1584   95  482  286 1094   60  939  676  712
## [1288]  862   80 1286  556  672  221  112  208  622  791  278  736  868
## [1301]  833  398  777  503  247  734  304  709  162  697  193 1252  223
## [1314]  333  278  762  732  656  936  190 1319  248  596  312  114  588
## [1327]  151  252  952 1422  595  141  560   77  896 1573 1140  811  953
## [1340]  589  877  136
#Let's plot normal histogram of filtered data
plotNormalHistogram(Turbidity)

qqnorm(Turbidity,
       ylab="Sample Quantiles for Turbidity")
qqline(Turbidity, 
       col="red")

#The data is indeed postively skewed and QQ plot shows the data has exponential shape.

#Square root transformation
T_sqrt = sqrt(Turbidity)
plotNormalHistogram(T_sqrt)

summary(T_sqrt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.742  16.971  23.152  23.309  29.039  48.332
#Cube -- from the shape, it looks like cube works the best
T_cub = sign(Turbidity) * abs(Turbidity)^(1/3)   # Avoid complex numbers 
plotNormalHistogram(T_cub)

summary(T_cub)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.410   6.604   8.123   8.029   9.448  13.269
#Log
T_log = log(Turbidity)
plotNormalHistogram(T_log)

summary(T_log)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.639   5.663   6.284   6.142   6.737   7.756
#Testing fit distr
estimate_original <- fitdistr(Turbidity, "exponential")
estimate_sqrt <- fitdistr(T_sqrt, "exponential")
estimate_cube <- fitdistr(T_cub, "exponential")
estimate_log <- fitdistr(T_log, "exponential")


#Let's compare each estimate


#original vs original_exp

#Skewness
Turbidity_exp = rexp(1000,rate = estimate_original$estimate)

plotNormalHistogram(Turbidity)

plotNormalHistogram(Turbidity_exp)

#quantile 5%, 95%
quantile(Turbidity, c(.05, .95))
##   5%  95% 
##  100 1489
quantile(Turbidity_exp, c(.05, .95))
##         5%        95% 
##   32.75834 1879.13611
summary(Turbidity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0   288.0   536.0   617.1   843.2  2336.0
summary(Turbidity_exp)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.797  173.395  421.810  626.249  893.184 3556.700
#original vs original_sqrt_exp

#Skewness
T_sqrt_exp = rexp(1000,rate = estimate_sqrt$estimate)

plotNormalHistogram(Turbidity)

plotNormalHistogram(T_sqrt_exp)

#quantile 5%, 95%
quantile(Turbidity, c(.05, .95))
##   5%  95% 
##  100 1489
quantile(T_sqrt_exp, c(.05, .95))
##        5%       95% 
##  1.338165 67.641899
#original vs original_cube_exp

#Skewness
T_cube_exp = rexp(1000,rate = estimate_cube$estimate)

plotNormalHistogram(Turbidity)

plotNormalHistogram(T_cube_exp)

#quantile 5%, 95%
quantile(Turbidity, c(.05, .95))
##   5%  95% 
##  100 1489
quantile(T_cube_exp, c(.05, .95))
##        5%       95% 
##  0.376118 22.448134
#original vs original_log_exp

#Skewness
T_log_exp = rexp(1000,rate = estimate_log$estimate)

plotNormalHistogram(Turbidity)

plotNormalHistogram(T_log_exp)

#quantile 5%, 95%
quantile(Turbidity, c(.05, .95))
##   5%  95% 
##  100 1489
quantile(T_log_exp, c(.05, .95))
##         5%        95% 
##  0.2458815 17.4323193
#95% CI for empirical data, assuming normality
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
CI(Turbidity, ci=0.95)  
##    upper     mean    lower 
## 639.9388 617.1170 594.2951
#In conclusion, we can see that exponential density function transform original datasets, whether they were transformed to log, sqrt, cube or stayed original, to exponential shape. Note that datasets applied with exponential function are more heavily skewed to the right than original datasets, according to Normal Histogram.

10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

#Some features missing too many values. Need to exclude these features.
library(DataExplorer)
plot_missing(data)[1]

## $data
##          feature num_missing  pct_missing  group
## 1             Id           0 0.0000000000   Good
## 2     MSSubClass           0 0.0000000000   Good
## 3       MSZoning           0 0.0000000000   Good
## 4    LotFrontage         259 0.1773972603     OK
## 5        LotArea           0 0.0000000000   Good
## 6         Street           0 0.0000000000   Good
## 7          Alley        1369 0.9376712329 Remove
## 8       LotShape           0 0.0000000000   Good
## 9    LandContour           0 0.0000000000   Good
## 10     Utilities           0 0.0000000000   Good
## 11     LotConfig           0 0.0000000000   Good
## 12     LandSlope           0 0.0000000000   Good
## 13  Neighborhood           0 0.0000000000   Good
## 14    Condition1           0 0.0000000000   Good
## 15    Condition2           0 0.0000000000   Good
## 16      BldgType           0 0.0000000000   Good
## 17    HouseStyle           0 0.0000000000   Good
## 18   OverallQual           0 0.0000000000   Good
## 19   OverallCond           0 0.0000000000   Good
## 20     YearBuilt           0 0.0000000000   Good
## 21  YearRemodAdd           0 0.0000000000   Good
## 22     RoofStyle           0 0.0000000000   Good
## 23      RoofMatl           0 0.0000000000   Good
## 24   Exterior1st           0 0.0000000000   Good
## 25   Exterior2nd           0 0.0000000000   Good
## 26    MasVnrType           8 0.0054794521   Good
## 27    MasVnrArea           8 0.0054794521   Good
## 28     ExterQual           0 0.0000000000   Good
## 29     ExterCond           0 0.0000000000   Good
## 30    Foundation           0 0.0000000000   Good
## 31      BsmtQual          37 0.0253424658   Good
## 32      BsmtCond          37 0.0253424658   Good
## 33  BsmtExposure          38 0.0260273973   Good
## 34  BsmtFinType1          37 0.0253424658   Good
## 35    BsmtFinSF1           0 0.0000000000   Good
## 36  BsmtFinType2          38 0.0260273973   Good
## 37    BsmtFinSF2           0 0.0000000000   Good
## 38     BsmtUnfSF           0 0.0000000000   Good
## 39   TotalBsmtSF           0 0.0000000000   Good
## 40       Heating           0 0.0000000000   Good
## 41     HeatingQC           0 0.0000000000   Good
## 42    CentralAir           0 0.0000000000   Good
## 43    Electrical           1 0.0006849315   Good
## 44     X1stFlrSF           0 0.0000000000   Good
## 45     X2ndFlrSF           0 0.0000000000   Good
## 46  LowQualFinSF           0 0.0000000000   Good
## 47     GrLivArea           0 0.0000000000   Good
## 48  BsmtFullBath           0 0.0000000000   Good
## 49  BsmtHalfBath           0 0.0000000000   Good
## 50      FullBath           0 0.0000000000   Good
## 51      HalfBath           0 0.0000000000   Good
## 52  BedroomAbvGr           0 0.0000000000   Good
## 53  KitchenAbvGr           0 0.0000000000   Good
## 54   KitchenQual           0 0.0000000000   Good
## 55  TotRmsAbvGrd           0 0.0000000000   Good
## 56    Functional           0 0.0000000000   Good
## 57    Fireplaces           0 0.0000000000   Good
## 58   FireplaceQu         690 0.4726027397    Bad
## 59    GarageType          81 0.0554794521     OK
## 60   GarageYrBlt          81 0.0554794521     OK
## 61  GarageFinish          81 0.0554794521     OK
## 62    GarageCars           0 0.0000000000   Good
## 63    GarageArea           0 0.0000000000   Good
## 64    GarageQual          81 0.0554794521     OK
## 65    GarageCond          81 0.0554794521     OK
## 66    PavedDrive           0 0.0000000000   Good
## 67    WoodDeckSF           0 0.0000000000   Good
## 68   OpenPorchSF           0 0.0000000000   Good
## 69 EnclosedPorch           0 0.0000000000   Good
## 70    X3SsnPorch           0 0.0000000000   Good
## 71   ScreenPorch           0 0.0000000000   Good
## 72      PoolArea           0 0.0000000000   Good
## 73        PoolQC        1453 0.9952054795 Remove
## 74         Fence        1179 0.8075342466 Remove
## 75   MiscFeature        1406 0.9630136986 Remove
## 76       MiscVal           0 0.0000000000   Good
## 77        MoSold           0 0.0000000000   Good
## 78        YrSold           0 0.0000000000   Good
## 79      SaleType           0 0.0000000000   Good
## 80 SaleCondition           0 0.0000000000   Good
## 81     SalePrice           0 0.0000000000   Good
feature_data_good <- plot_missing(data)$data[plot_missing(data)$data$group == 'Good',]

feature_good_list <- feature_data_good$feature
feature_good_list <- as.character(feature_good_list)
data_good <- data[c(feature_good_list)]
model <- lm(SalePrice ~ ., data=data_good)
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = data_good)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -181959   -9704     370    9644  181959 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.006e+06  1.070e+06  -0.941 0.347054    
## Id                    6.611e-01  1.591e+00   0.416 0.677767    
## MSSubClass           -3.948e+01  8.538e+01  -0.462 0.643913    
## MSZoningFV            3.610e+04  1.215e+04   2.972 0.003022 ** 
## MSZoningRH            2.449e+04  1.211e+04   2.022 0.043427 *  
## MSZoningRL            2.690e+04  1.039e+04   2.588 0.009760 ** 
## MSZoningRM            2.334e+04  9.693e+03   2.408 0.016187 *  
## LotArea               7.549e-01  1.098e-01   6.877 9.81e-12 ***
## StreetPave            3.578e+04  1.213e+04   2.950 0.003236 ** 
## LotShapeIR2           6.483e+03  4.282e+03   1.514 0.130277    
## LotShapeIR3           4.755e+03  8.911e+03   0.534 0.593701    
## LotShapeReg           1.067e+03  1.640e+03   0.650 0.515663    
## LandContourHLS        9.008e+03  5.246e+03   1.717 0.086208 .  
## LandContourLow       -1.027e+04  6.712e+03  -1.530 0.126159    
## LandContourLvl        6.210e+03  3.813e+03   1.629 0.103634    
## UtilitiesNoSeWa      -3.133e+04  2.621e+04  -1.195 0.232199    
## LotConfigCulDSac      8.072e+03  3.281e+03   2.461 0.014010 *  
## LotConfigFR2         -7.753e+03  4.092e+03  -1.895 0.058383 .  
## LotConfigFR3         -1.418e+04  1.273e+04  -1.114 0.265549    
## LotConfigInside      -1.536e+03  1.785e+03  -0.860 0.389905    
## LandSlopeMod          7.331e+03  4.061e+03   1.805 0.071319 .  
## LandSlopeSev         -3.948e+04  1.154e+04  -3.420 0.000646 ***
## NeighborhoodBlueste  -2.683e+03  1.926e+04  -0.139 0.889230    
## NeighborhoodBrDale    1.170e+03  1.110e+04   0.105 0.916010    
## NeighborhoodBrkSide  -3.549e+03  9.534e+03  -0.372 0.709818    
## NeighborhoodClearCr  -1.835e+04  9.398e+03  -1.952 0.051165 .  
## NeighborhoodCollgCr  -1.195e+04  7.262e+03  -1.645 0.100217    
## NeighborhoodCrawfor   9.752e+03  8.624e+03   1.131 0.258381    
## NeighborhoodEdwards  -2.006e+04  8.076e+03  -2.484 0.013109 *  
## NeighborhoodGilbert  -1.311e+04  7.796e+03  -1.682 0.092910 .  
## NeighborhoodIDOTRR   -8.886e+03  1.084e+04  -0.819 0.412675    
## NeighborhoodMeadowV  -8.485e+03  1.132e+04  -0.750 0.453674    
## NeighborhoodMitchel  -2.417e+04  8.259e+03  -2.927 0.003488 ** 
## NeighborhoodNAmes    -1.836e+04  7.868e+03  -2.333 0.019810 *  
## NeighborhoodNoRidge   2.332e+04  8.508e+03   2.741 0.006209 ** 
## NeighborhoodNPkVill   8.051e+03  1.410e+04   0.571 0.568013    
## NeighborhoodNridgHt   1.550e+04  7.451e+03   2.081 0.037684 *  
## NeighborhoodNWAmes   -2.072e+04  8.081e+03  -2.564 0.010458 *  
## NeighborhoodOldTown  -1.444e+04  9.722e+03  -1.486 0.137609    
## NeighborhoodSawyer   -1.190e+04  8.203e+03  -1.451 0.147112    
## NeighborhoodSawyerW  -5.958e+03  7.870e+03  -0.757 0.449228    
## NeighborhoodSomerst  -4.634e+03  9.066e+03  -0.511 0.609317    
## NeighborhoodStoneBr   3.450e+04  8.352e+03   4.130 3.87e-05 ***
## NeighborhoodSWISU    -9.436e+03  9.700e+03  -0.973 0.330885    
## NeighborhoodTimber   -1.254e+04  8.240e+03  -1.521 0.128403    
## NeighborhoodVeenker  -3.513e+03  1.060e+04  -0.332 0.740306    
## Condition1Feedr       1.914e+03  5.082e+03   0.377 0.706506    
## Condition1Norm        1.090e+04  4.183e+03   2.606 0.009270 ** 
## Condition1PosA        3.346e+03  1.006e+04   0.333 0.739471    
## Condition1PosN        7.609e+03  7.481e+03   1.017 0.309278    
## Condition1RRAe       -1.838e+04  9.531e+03  -1.929 0.053988 .  
## Condition1RRAn        4.855e+03  6.893e+03   0.704 0.481320    
## Condition1RRNe       -6.197e+03  1.783e+04  -0.348 0.728236    
## Condition1RRNn        2.711e+03  1.289e+04   0.210 0.833389    
## Condition2Feedr      -6.364e+02  2.255e+04  -0.028 0.977492    
## Condition2Norm       -1.456e+03  1.922e+04  -0.076 0.939649    
## Condition2PosA        4.036e+04  3.707e+04   1.089 0.276560    
## Condition2PosN       -2.317e+05  2.700e+04  -8.582  < 2e-16 ***
## Condition2RRAe       -1.262e+05  4.608e+04  -2.738 0.006271 ** 
## Condition2RRAn       -8.232e+03  3.103e+04  -0.265 0.790826    
## Condition2RRNn        3.070e+03  2.654e+04   0.116 0.907923    
## BldgType2fmCon       -4.050e+03  1.288e+04  -0.315 0.753166    
## BldgTypeDuplex       -7.155e+03  7.528e+03  -0.950 0.342053    
## BldgTypeTwnhs        -1.990e+04  1.017e+04  -1.956 0.050717 .  
## BldgTypeTwnhsE       -1.742e+04  9.235e+03  -1.886 0.059491 .  
## HouseStyle1.5Unf      1.421e+04  7.776e+03   1.827 0.067903 .  
## HouseStyle1Story      1.012e+04  4.393e+03   2.304 0.021412 *  
## HouseStyle2.5Fin     -1.650e+04  1.201e+04  -1.374 0.169816    
## HouseStyle2.5Unf     -6.288e+03  9.184e+03  -0.685 0.493708    
## HouseStyle2Story     -5.837e+03  3.557e+03  -1.641 0.101052    
## HouseStyleSFoyer      6.114e+03  6.540e+03   0.935 0.350078    
## HouseStyleSLvl        4.373e+03  5.630e+03   0.777 0.437476    
## OverallQual           6.734e+03  1.033e+03   6.520 1.04e-10 ***
## OverallCond           5.892e+03  8.873e+02   6.641 4.71e-11 ***
## YearBuilt             3.163e+02  7.548e+01   4.191 2.98e-05 ***
## YearRemodAdd          1.065e+02  5.645e+01   1.886 0.059548 .  
## RoofStyleGable        8.891e+03  1.853e+04   0.480 0.631547    
## RoofStyleGambrel      1.211e+04  2.021e+04   0.599 0.549015    
## RoofStyleHip          9.383e+03  1.859e+04   0.505 0.613784    
## RoofStyleMansard      2.528e+04  2.160e+04   1.170 0.242045    
## RoofStyleShed         1.041e+05  3.501e+04   2.973 0.003010 ** 
## RoofMatlCompShg       6.599e+05  3.324e+04  19.851  < 2e-16 ***
## RoofMatlMembran       7.559e+05  4.840e+04  15.620  < 2e-16 ***
## RoofMatlMetal         7.301e+05  4.723e+04  15.457  < 2e-16 ***
## RoofMatlRoll          6.498e+05  4.176e+04  15.562  < 2e-16 ***
## RoofMatlTar&Grv       6.643e+05  3.829e+04  17.350  < 2e-16 ***
## RoofMatlWdShake       6.470e+05  3.686e+04  17.555  < 2e-16 ***
## RoofMatlWdShngl       7.311e+05  3.434e+04  21.288  < 2e-16 ***
## Exterior1stBrkComm   -3.561e+04  3.359e+04  -1.060 0.289248    
## Exterior1stBrkFace    1.073e+04  1.318e+04   0.814 0.416040    
## Exterior1stCBlock    -2.325e+04  2.751e+04  -0.845 0.398158    
## Exterior1stCemntBd   -1.047e+04  1.937e+04  -0.541 0.588877    
## Exterior1stHdBoard   -1.076e+04  1.324e+04  -0.812 0.416738    
## Exterior1stImStucc   -4.708e+04  2.802e+04  -1.680 0.093180 .  
## Exterior1stMetalSd   -3.445e+03  1.491e+04  -0.231 0.817275    
## Exterior1stPlywood   -1.304e+04  1.314e+04  -0.993 0.321080    
## Exterior1stStone      5.595e+02  2.643e+04   0.021 0.983114    
## Exterior1stStucco    -8.951e+03  1.444e+04  -0.620 0.535361    
## Exterior1stVinylSd   -1.241e+04  1.352e+04  -0.919 0.358516    
## Exterior1stWd Sdng   -9.846e+03  1.272e+04  -0.774 0.439199    
## Exterior1stWdShing   -7.551e+03  1.374e+04  -0.549 0.582846    
## Exterior2ndAsphShn    5.240e+03  2.258e+04   0.232 0.816529    
## Exterior2ndBrk Cmn    7.886e+03  2.055e+04   0.384 0.701224    
## Exterior2ndBrkFace   -1.617e+03  1.376e+04  -0.117 0.906517    
## Exterior2ndCBlock            NA         NA      NA       NA    
## Exterior2ndCmentBd    1.109e+04  1.915e+04   0.579 0.562694    
## Exterior2ndHdBoard    5.289e+03  1.290e+04   0.410 0.681974    
## Exterior2ndImStucc    2.495e+04  1.463e+04   1.705 0.088487 .  
## Exterior2ndMetalSd    2.348e+03  1.463e+04   0.161 0.872514    
## Exterior2ndOther     -1.546e+04  2.769e+04  -0.558 0.576672    
## Exterior2ndPlywood    4.003e+03  1.257e+04   0.318 0.750237    
## Exterior2ndStone     -2.029e+04  2.395e+04  -0.847 0.397089    
## Exterior2ndStucco     7.966e+03  1.389e+04   0.573 0.566470    
## Exterior2ndVinylSd    1.007e+04  1.307e+04   0.771 0.441003    
## Exterior2ndWd Sdng    7.152e+03  1.240e+04   0.577 0.564319    
## Exterior2ndWd Shng    1.643e+03  1.283e+04   0.128 0.898071    
## MasVnrTypeBrkFace     6.662e+03  6.891e+03   0.967 0.333864    
## MasVnrTypeNone        1.059e+04  6.944e+03   1.525 0.127544    
## MasVnrTypeStone       1.168e+04  7.277e+03   1.605 0.108713    
## MasVnrArea            2.129e+01  5.844e+00   3.642 0.000282 ***
## ExterQualFa          -3.646e+03  1.161e+04  -0.314 0.753468    
## ExterQualGd          -1.781e+04  4.877e+03  -3.651 0.000272 ***
## ExterQualTA          -1.854e+04  5.394e+03  -3.437 0.000609 ***
## ExterCondFa          -3.950e+03  1.847e+04  -0.214 0.830725    
## ExterCondGd          -8.644e+03  1.746e+04  -0.495 0.620556    
## ExterCondPo           3.342e+03  3.209e+04   0.104 0.917065    
## ExterCondTA          -7.094e+03  1.742e+04  -0.407 0.683949    
## FoundationCBlock      3.202e+03  3.203e+03   1.000 0.317646    
## FoundationPConc       4.697e+03  3.446e+03   1.363 0.173151    
## FoundationStone       5.081e+03  1.101e+04   0.462 0.644512    
## FoundationWood       -2.619e+04  1.471e+04  -1.780 0.075334 .  
## BsmtQualFa           -1.374e+04  6.430e+03  -2.137 0.032820 *  
## BsmtQualGd           -1.866e+04  3.335e+03  -5.596 2.71e-08 ***
## BsmtQualTA           -1.576e+04  4.142e+03  -3.805 0.000149 ***
## BsmtCondGd           -1.188e+03  5.344e+03  -0.222 0.824089    
## BsmtCondPo            6.785e+04  3.079e+04   2.203 0.027758 *  
## BsmtCondTA            3.434e+03  4.289e+03   0.801 0.423570    
## BsmtExposureGd        1.235e+04  3.048e+03   4.053 5.37e-05 ***
## BsmtExposureMn       -3.993e+03  3.053e+03  -1.308 0.191132    
## BsmtExposureNo       -6.070e+03  2.222e+03  -2.731 0.006403 ** 
## BsmtFinType1BLQ       1.013e+03  2.792e+03   0.363 0.716793    
## BsmtFinType1GLQ       5.221e+03  2.549e+03   2.048 0.040771 *  
## BsmtFinType1LwQ      -3.931e+03  3.765e+03  -1.044 0.296580    
## BsmtFinType1Rec      -6.068e+02  3.025e+03  -0.201 0.841069    
## BsmtFinType1Unf       2.605e+03  2.954e+03   0.882 0.378033    
## BsmtFinSF1            3.850e+01  5.555e+00   6.930 6.83e-12 ***
## BsmtFinType2BLQ      -1.477e+04  7.637e+03  -1.934 0.053365 .  
## BsmtFinType2GLQ      -4.868e+03  9.447e+03  -0.515 0.606458    
## BsmtFinType2LwQ      -1.597e+04  7.450e+03  -2.144 0.032250 *  
## BsmtFinType2Rec      -1.096e+04  7.189e+03  -1.525 0.127646    
## BsmtFinType2Unf      -1.076e+04  7.640e+03  -1.408 0.159400    
## BsmtFinSF2            2.907e+01  9.303e+00   3.125 0.001820 ** 
## BsmtUnfSF             1.975e+01  5.108e+00   3.866 0.000117 ***
## TotalBsmtSF                  NA         NA      NA       NA    
## HeatingGasW          -6.440e+03  7.120e+03  -0.905 0.365906    
## HeatingGrav          -4.104e+03  1.127e+04  -0.364 0.715678    
## HeatingOthW          -2.684e+04  1.873e+04  -1.433 0.152194    
## HeatingQCFa          -6.364e+01  4.904e+03  -0.013 0.989648    
## HeatingQCGd          -3.394e+03  2.106e+03  -1.612 0.107207    
## HeatingQCPo           5.322e+03  2.722e+04   0.195 0.845036    
## HeatingQCTA          -3.206e+03  2.102e+03  -1.525 0.127437    
## CentralAirY           9.611e+02  4.030e+03   0.238 0.811540    
## ElectricalFuseF      -1.775e+03  6.410e+03  -0.277 0.781913    
## ElectricalFuseP      -1.065e+04  2.008e+04  -0.531 0.595689    
## ElectricalMix        -6.154e+04  4.076e+04  -1.510 0.131318    
## ElectricalSBrkr      -9.967e+02  3.046e+03  -0.327 0.743575    
## X1stFlrSF             4.683e+01  5.895e+00   7.944 4.47e-15 ***
## X2ndFlrSF             6.913e+01  5.327e+00  12.978  < 2e-16 ***
## LowQualFinSF          2.646e+01  1.828e+01   1.448 0.147897    
## GrLivArea                    NA         NA      NA       NA    
## BsmtFullBath          7.242e+02  1.999e+03   0.362 0.717188    
## BsmtHalfBath         -1.048e+03  3.059e+03  -0.343 0.731953    
## FullBath              3.097e+03  2.244e+03   1.380 0.167911    
## HalfBath              3.661e+02  2.126e+03   0.172 0.863321    
## BedroomAbvGr         -3.396e+03  1.402e+03  -2.422 0.015598 *  
## KitchenAbvGr         -1.360e+04  5.824e+03  -2.335 0.019719 *  
## KitchenQualFa        -1.953e+04  6.294e+03  -3.102 0.001964 ** 
## KitchenQualGd        -2.555e+04  3.444e+03  -7.419 2.23e-13 ***
## KitchenQualTA        -2.284e+04  3.924e+03  -5.821 7.50e-09 ***
## TotRmsAbvGrd          1.275e+03  9.710e+02   1.313 0.189346    
## FunctionalMaj2       -5.392e+03  1.472e+04  -0.366 0.714275    
## FunctionalMin1        4.241e+03  8.945e+03   0.474 0.635535    
## FunctionalMin2        5.763e+03  8.979e+03   0.642 0.521147    
## FunctionalMod        -5.006e+03  1.162e+04  -0.431 0.666651    
## FunctionalSev        -3.598e+04  2.810e+04  -1.281 0.200583    
## FunctionalTyp         1.547e+04  7.737e+03   2.000 0.045758 *  
## Fireplaces            2.768e+03  1.362e+03   2.032 0.042335 *  
## GarageCars            3.165e+03  2.192e+03   1.444 0.148999    
## GarageArea            1.459e+01  7.556e+00   1.930 0.053808 .  
## PavedDriveP          -1.969e+03  5.560e+03  -0.354 0.723243    
## PavedDriveY           5.451e+02  3.574e+03   0.153 0.878810    
## WoodDeckSF            9.883e+00  5.892e+00   1.677 0.093724 .  
## OpenPorchSF           3.924e+00  1.168e+01   0.336 0.737036    
## EnclosedPorch         5.352e+00  1.271e+01   0.421 0.673795    
## X3SsnPorch            2.682e+01  2.281e+01   1.176 0.239960    
## ScreenPorch           3.469e+01  1.227e+01   2.829 0.004754 ** 
## PoolArea              7.976e+01  1.835e+01   4.347 1.50e-05 ***
## MiscVal              -3.337e-01  1.449e+00  -0.230 0.817912    
## MoSold               -3.818e+02  2.505e+02  -1.524 0.127722    
## YrSold               -2.673e+02  5.260e+02  -0.508 0.611375    
## SaleTypeCon           3.000e+04  1.786e+04   1.680 0.093256 .  
## SaleTypeConLD         1.938e+04  1.044e+04   1.855 0.063820 .  
## SaleTypeConLI         5.450e+03  1.164e+04   0.468 0.639738    
## SaleTypeConLw        -2.087e+03  1.229e+04  -0.170 0.865209    
## SaleTypeCWD           1.545e+04  1.311e+04   1.179 0.238606    
## SaleTypeNew           2.260e+04  1.578e+04   1.432 0.152454    
## SaleTypeOth           1.040e+04  1.467e+04   0.709 0.478693    
## SaleTypeWD           -7.511e+02  4.312e+03  -0.174 0.861755    
## SaleConditionAdjLand  1.412e+04  1.611e+04   0.876 0.380995    
## SaleConditionAlloca   1.163e+04  9.993e+03   1.164 0.244843    
## SaleConditionFamily  -1.889e+03  6.157e+03  -0.307 0.759014    
## SaleConditionNormal   5.408e+03  2.942e+03   1.838 0.066253 .  
## SaleConditionPartial -2.839e+03  1.517e+04  -0.187 0.851561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23110 on 1202 degrees of freedom
##   (48 observations deleted due to missingness)
## Multiple R-squared:  0.9276, Adjusted R-squared:  0.915 
## F-statistic: 73.71 on 209 and 1202 DF,  p-value: < 2.2e-16
#Due to strong collinearlity between variables, we will omit some variables with Estimate NA.
#Not only that, for simplicity, we will will focus on few quantitative independent variables with p value < 0.05 only.
#I will choose 1 continous independent variable and 1 categorical variable.
data_good_2 <- data_good[c("SalePrice", "LotArea", "Street")]
head(data_good_2)
##   SalePrice LotArea Street
## 1    208500    8450   Pave
## 2    181500    9600   Pave
## 3    223500   11250   Pave
## 4    140000    9550   Pave
## 5    250000   14260   Pave
## 6    143000   14115   Pave
model_2 <- lm(SalePrice ~ LotArea + Street, data=data_good_2)
plot(model_2)

#Model doesn't seem to fit well. Let's transform variables using Box-Cox
trans <- boxcox(model_2)

trans
## $x
##   [1] -2.00000000 -1.95959596 -1.91919192 -1.87878788 -1.83838384
##   [6] -1.79797980 -1.75757576 -1.71717172 -1.67676768 -1.63636364
##  [11] -1.59595960 -1.55555556 -1.51515152 -1.47474747 -1.43434343
##  [16] -1.39393939 -1.35353535 -1.31313131 -1.27272727 -1.23232323
##  [21] -1.19191919 -1.15151515 -1.11111111 -1.07070707 -1.03030303
##  [26] -0.98989899 -0.94949495 -0.90909091 -0.86868687 -0.82828283
##  [31] -0.78787879 -0.74747475 -0.70707071 -0.66666667 -0.62626263
##  [36] -0.58585859 -0.54545455 -0.50505051 -0.46464646 -0.42424242
##  [41] -0.38383838 -0.34343434 -0.30303030 -0.26262626 -0.22222222
##  [46] -0.18181818 -0.14141414 -0.10101010 -0.06060606 -0.02020202
##  [51]  0.02020202  0.06060606  0.10101010  0.14141414  0.18181818
##  [56]  0.22222222  0.26262626  0.30303030  0.34343434  0.38383838
##  [61]  0.42424242  0.46464646  0.50505051  0.54545455  0.58585859
##  [66]  0.62626263  0.66666667  0.70707071  0.74747475  0.78787879
##  [71]  0.82828283  0.86868687  0.90909091  0.94949495  0.98989899
##  [76]  1.03030303  1.07070707  1.11111111  1.15151515  1.19191919
##  [81]  1.23232323  1.27272727  1.31313131  1.35353535  1.39393939
##  [86]  1.43434343  1.47474747  1.51515152  1.55555556  1.59595960
##  [91]  1.63636364  1.67676768  1.71717172  1.75757576  1.79797980
##  [96]  1.83838384  1.87878788  1.91919192  1.95959596  2.00000000
## 
## $y
##   [1] -4948.093 -4903.714 -4860.290 -4817.832 -4776.348 -4735.848 -4696.343
##   [8] -4657.838 -4620.341 -4583.856 -4548.388 -4513.940 -4480.516 -4448.115
##  [15] -4416.738 -4386.385 -4357.054 -4328.742 -4301.447 -4275.162 -4249.885
##  [22] -4225.609 -4202.327 -4180.033 -4158.719 -4138.378 -4119.000 -4100.578
##  [29] -4083.102 -4066.564 -4050.953 -4036.261 -4022.477 -4009.593 -3997.599
##  [36] -3986.485 -3976.242 -3966.862 -3958.335 -3950.652 -3943.806 -3937.788
##  [43] -3932.590 -3928.205 -3924.625 -3921.845 -3919.857 -3918.656 -3918.235
##  [50] -3918.590 -3919.715 -3921.607 -3924.261 -3927.673 -3931.839 -3936.758
##  [57] -3942.426 -3948.842 -3956.002 -3963.906 -3972.553 -3981.941 -3992.069
##  [64] -4002.938 -4014.548 -4026.898 -4039.990 -4053.823 -4068.399 -4083.718
##  [71] -4099.783 -4116.593 -4134.152 -4152.460 -4171.519 -4191.332 -4211.899
##  [78] -4233.223 -4255.306 -4278.150 -4301.755 -4326.125 -4351.261 -4377.164
##  [85] -4403.836 -4431.278 -4459.492 -4488.477 -4518.236 -4548.767 -4580.072
##  [92] -4612.150 -4645.002 -4678.625 -4713.020 -4748.184 -4784.117 -4820.815
##  [99] -4858.277 -4896.500
trans_df <- as.data.frame(trans)

#Find optima lambda
optimal_lambda <- trans_df[which.max(trans$y),1]

#Let's combine the results and make dataframe of the results
transdata = cbind(data_good_2,              data_good_2$LotArea^optimal_lambda, data$Street^optimal_lambda,
data_good_2$SalePrice^optimal_lambda)
## Warning in Ops.factor(data$Street, optimal_lambda): '^' not meaningful for
## factors
names(transdata)[4] = "LotArea_boxcox"
names(transdata)[5] = "Street_boxcox"
names(transdata)[6] = "SalePrice_boxcox"

#Let's see the transformed data and summary of it
head(transdata)
##   SalePrice LotArea Street LotArea_boxcox Street_boxcox SalePrice_boxcox
## 1    208500    8450   Pave      0.5781076            NA        0.4760252
## 2    181500    9600   Pave      0.5736543            NA        0.4800431
## 3    223500   11250   Pave      0.5681665            NA        0.4740251
## 4    140000    9550   Pave      0.5738358            NA        0.4876559
## 5    250000   14260   Pave      0.5600608            NA        0.4708170
## 6    143000   14115   Pave      0.5604078            NA        0.4870296
summary(transdata)
##    SalePrice         LotArea        Street     LotArea_boxcox  
##  Min.   : 34900   Min.   :  1300   Grvl:   6   Min.   :0.4751  
##  1st Qu.:129975   1st Qu.:  7554   Pave:1454   1st Qu.:0.5671  
##  Median :163000   Median :  9478               Median :0.5741  
##  Mean   :180921   Mean   : 10517               Mean   :0.5760  
##  3rd Qu.:214000   3rd Qu.: 11602               3rd Qu.:0.5821  
##  Max.   :755000   Max.   :215245               Max.   :0.6476  
##  Street_boxcox  SalePrice_boxcox
##  Mode:logical   Min.   :0.4403  
##  NA's:1460      1st Qu.:0.4753  
##                 Median :0.4832  
##                 Mean   :0.4827  
##                 3rd Qu.:0.4899  
##                 Max.   :0.5305
#Apparently, since Street is a categorical varialbe, Box-Cox transform is not needed. Summary shows all NAs for Street varible. We will use normal Street variable and use LotArea_boxcox for our model

#Let's see the histogram of transformed variables
hist(transdata$SalePrice_boxcox, col = "blue", main = "Historgram of SalePrice Transformed by Box-Cox")

hist(transdata$LotArea_boxcox, col = "blue", main = "Historgram of LotArea Transformed by Box-Cox")

#Transformed variables look more or less normally distributed
#Residual vs fitted and QQ plot shows the model is normally distributed
model_3 <- lm(SalePrice_boxcox ~ LotArea_boxcox + Street, data=transdata)
summary(model_3)
## 
## Call:
## lm(formula = SalePrice_boxcox ~ LotArea_boxcox + Street, data = transdata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036932 -0.007109  0.000693  0.006594  0.047546 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.349537   0.009521  36.711  < 2e-16 ***
## LotArea_boxcox  0.262375   0.015462  16.969  < 2e-16 ***
## StreetPave     -0.018072   0.004380  -4.126  3.9e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01065 on 1457 degrees of freedom
## Multiple R-squared:  0.1679, Adjusted R-squared:  0.1667 
## F-statistic:   147 on 2 and 1457 DF,  p-value: < 2.2e-16
plot(model_3)

#From model_3 summary, we know that both LotArea and Street have marginal impact on SalePrice with p value < 0.05.
#We know that 1 unit increase in LotArea_boxcox increases SalePrice_boxcox by 0.262375.
#When Stree = Pave, SalePrice_boxcox tend to decrease by 0.018.
#We reject null hypothesis as independent variables are statistically significant; they do have impact on dependent variable

#Now, let's perform prediction on test data sets
test <- read.csv("https://raw.githubusercontent.com/wheremagichappens/an.dy/master/data605/test.csv")

#To predict SalePrice, we need to change the name of variables from LotArea_boxcox and SalePrice_boxcox to LotArea and SalePrice
names(transdata)[4] = "LotArea"
names(transdata)[6] = "SalePrice"
transdata <- transdata[c(3,4,6)]

#Not only that, we need to transform LotArea variable in test set to box-cox as well
test$LotArea <- test$LotArea^optimal_lambda

#Re-run the model
model_3 <- lm(SalePrice ~ LotArea   + Street, data=transdata)
summary(model_3)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + Street, data = transdata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.036932 -0.007109  0.000693  0.006594  0.047546 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.349537   0.009521  36.711  < 2e-16 ***
## LotArea      0.262375   0.015462  16.969  < 2e-16 ***
## StreetPave  -0.018072   0.004380  -4.126  3.9e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01065 on 1457 degrees of freedom
## Multiple R-squared:  0.1679, Adjusted R-squared:  0.1667 
## F-statistic:   147 on 2 and 1457 DF,  p-value: < 2.2e-16
plot(model_3)

#Create dataframe for prediction and show results
pred_SalePrice <- predict(model_3,test)
prediction <- data.frame(Id = test[,"Id"],  SalePrice = pred_SalePrice)
prediction[prediction<0] <- 0
prediction <- replace(prediction,is.na(prediction),0)

head(prediction)
##     Id SalePrice
## 1 1461 0.4802441
## 2 1462 0.4784066
## 3 1463 0.4786839
## 4 1464 0.4816257
## 5 1465 0.4880377
## 6 1466 0.4816056
prediction$SalePrice <- prediction$SalePrice^(1/optimal_lambda)

head(prediction)
##     Id SalePrice
## 1 1461  180250.7
## 2 1462  192020.2
## 3 1463  190193.0
## 4 1464  171906.1
## 5 1465  138203.4
## 6 1466  172024.1
#Note that SalePrice is in box-cox so we transform it back to original value
#Of course, the prediction is based on only few independent variables but still meaningful.
write.csv(prediction, "myprediction.csv")

My kaggle user name is an11dy and my score was 0.39593.