library(tidyverse)
library(cubature)
library(expm)
library(igraph)
library(readr)
library(Rfast)
library(nnet)
library(corrplot)
library(ggrepel)
library(Matrix)
library(MASS)
library(Metrics)
library(OpenImageR)
library(caret)

1.PageRank

Form the A matrix. Then, introduce decay and form the B matrix as we did in the course notes.

A<-matrix(c(0,0,1/3,0,0,0,
            1/2,0,1/3,0,0,0,
            1/2,0,0,0,0,0,
            0,0,0,0,1/2,1,
            0,0,1/3,1/2,0,0,
            0,0,0,1/2,1/2,0),6)
A
##           [,1]      [,2] [,3] [,4]      [,5] [,6]
## [1,] 0.0000000 0.5000000  0.5  0.0 0.0000000  0.0
## [2,] 0.0000000 0.0000000  0.0  0.0 0.0000000  0.0
## [3,] 0.3333333 0.3333333  0.0  0.0 0.3333333  0.0
## [4,] 0.0000000 0.0000000  0.0  0.0 0.5000000  0.5
## [5,] 0.0000000 0.0000000  0.0  0.5 0.0000000  0.5
## [6,] 0.0000000 0.0000000  0.0  1.0 0.0000000  0.0
B<-0.85*A+(0.15/6)
B
##           [,1]      [,2]  [,3]  [,4]      [,5]  [,6]
## [1,] 0.0250000 0.4500000 0.450 0.025 0.0250000 0.025
## [2,] 0.0250000 0.0250000 0.025 0.025 0.0250000 0.025
## [3,] 0.3083333 0.3083333 0.025 0.025 0.3083333 0.025
## [4,] 0.0250000 0.0250000 0.025 0.025 0.4500000 0.450
## [5,] 0.0250000 0.0250000 0.025 0.450 0.0250000 0.450
## [6,] 0.0250000 0.0250000 0.025 0.875 0.0250000 0.025

Start with a uniform rank vector r and perform power iterations on B till convergence. That is, compute the solution r = Bn × r. Attempt this for a sufficiently large n so that r actually converges. (5 Points)

r_i<-c(1/6,1/6,1/6,1/6,1/6,1/6)
r_i
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
r_f<-(B%^%5)%*%r_i
r_f
##            [,1]
## [1,] 0.06055435
## [2,] 0.01558875
## [3,] 0.08188545
## [4,] 0.14329837
## [5,] 0.14329837
## [6,] 0.14329837
powerIteration <- function(A, r, n) {
  Iter = diag(dim(A)[1])
  for (i in 1:n)
  {
    Iter = Iter %*% A
  }
  return (Iter %*% r)
}

powerIteration(A, r_i, 5)
##            [,1]
## [1,] 0.03703704
## [2,] 0.00000000
## [3,] 0.06944444
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(A, r_i, 15)
##            [,1]
## [1,] 0.03333381
## [2,] 0.00000000
## [3,] 0.06666702
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(A, r_i, 25)
##            [,1]
## [1,] 0.03333333
## [2,] 0.00000000
## [3,] 0.06666667
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(B, r_i, 5)
##            [,1]
## [1,] 0.06055435
## [2,] 0.01558875
## [3,] 0.08188545
## [4,] 0.14329837
## [5,] 0.14329837
## [6,] 0.14329837
powerIteration(B, r_i, 15)
##            [,1]
## [1,] 0.03524057
## [2,] 0.00934132
## [3,] 0.04865137
## [4,] 0.08745120
## [5,] 0.08745120
## [6,] 0.08745120
powerIteration(B, r_i, 125)
##              [,1]
## [1,] 0.0001512418
## [2,] 0.0000400904
## [3,] 0.0002087980
## [4,] 0.0003753185
## [5,] 0.0003753185
## [6,] 0.0003753185

The rank are in the descending order of 4,5,6,3,1,2. 4, 5 and 6 are in the same rank. The order is the same even n=100, as such it’s convergence, but the numerical number goes to zero.

Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.(10 points)

eigen(B)
## eigen() decomposition
## $values
## [1]  0.95165271+0i -0.42500000+0i -0.42500000-0i  0.41436582+0i -0.34733611+0i
## [6] -0.01868242+0i
## 
## $vectors
##               [,1]                        [,2]                        [,3]
## [1,] -0.2159123+0i -5.345225e-01-0.000000e+00i -5.345225e-01+0.000000e+00i
## [2,] -0.0572329+0i -5.200810e-17+3.664536e-10i -5.200810e-17-3.664536e-10i
## [3,] -0.2980792+0i  5.345225e-01+0.000000e+00i  5.345225e-01+0.000000e+00i
## [4,] -0.5358032+0i -2.672612e-01+0.000000e+00i -2.672612e-01-0.000000e+00i
## [5,] -0.5358032+0i -2.672612e-01+0.000000e+00i -2.672612e-01-0.000000e+00i
## [6,] -0.5358032+0i  5.345225e-01-0.000000e+00i  5.345225e-01+0.000000e+00i
##                [,4]            [,5]           [,6]
## [1,] -0.77869913+0i -0.775079691+0i  0.55060201+0i
## [2,] -0.07537631+0i  0.009090375+0i -0.61511270+0i
## [3,] -0.61034825+0i  0.631781588+0i  0.56386946+0i
## [4,]  0.07169632+0i  0.002637034+0i -0.01322899+0i
## [5,]  0.07169632+0i  0.002637034+0i -0.01322899+0i
## [6,]  0.07169632+0i  0.002637034+0i -0.01322899+0i
unit_Vector<-function(v){
  len<-sqrt(sum(v^2))
  return (v/len)
}
graphObj <- graph.adjacency(A, weighted = TRUE, mode = "directed")
plot(graphObj)

#Compute page rank using the igraph
prVec <- page.rank(graphObj)$vector

#Compute the unit vector to compare with earlier results
matrix(unit_Vector(prVec), nrow=6, byrow=TRUE)
##           [,1]
## [1,] 0.1044385
## [2,] 0.1488249
## [3,] 0.1159674
## [4,] 0.7043472
## [5,] 0.4037861
## [6,] 0.5425377

2.MNIST

temp<-tempfile()
download.file("https://github.com/ferrysany/data605/raw/main/train.csv.zip", temp)
mnist_train<-read_csv(unzip(temp, "train.csv"))
dim(mnist_train)
## [1] 42000   785
  1. Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)
mt_pixel<-mnist_train[,2:785]/255
mt_pixel$label<-mnist_train$label

for (i in 1:10){
  
  digi<- matrix(as.numeric(mt_pixel[i,1:(ncol(mt_pixel)-1)]),nrow=28)
  digi<-digi[,ncol(digi):1]
  image(digi, col=grey.colors(255), axes=FALSE, xlim=c(0,1), ylim=c(-0.2,1.2))
  
}

mnist_train[1:10,1]
  1. What is the frequency distribution of the numbers in the dataset? (5 points)
ggplot(mt_pixel, aes(x=label))+
  geom_histogram()+
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10))+
  labs(title = 'Frequency Distribution of Numbers', y='Frequency', x='Number')

  1. For each number, provide the mean pixel intensity. What does this tell you? (5 points)
pix_intensity<-mt_pixel %>%
  mutate(psum=rowSums(across(starts_with('pixel'))), na.rm=T)%>%
  dplyr::select(label, psum)%>%
  group_by(label)%>%
  summarize(mean_pixel=mean(psum))
pix_intensity

The data shows that people draw curves with high intensity.

6.Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?

pca_pixel<-mt_pixel%>%
  dplyr::select(-label)
mypca<-prcomp(pca_pixel)
mypca_var<-cumsum((mypca$sdev)^2)/sum((mypca$sdev)^2)
plot(mypca_var)

It looks like 95% is close to 180 index

which.max(mypca_var>=.95)
## [1] 154
which.max(mypca_var>=1.0)
## [1] 704

The actual 95% is at 154.

  1. Plot the first 10 images generated by PCA. They will appear to be noise. Why?
for(j in 1:10){
  plot(4,4, xlim=c(0,28), ylim=c(0,28))
  imageShow(array(mypca$rotation[,j],c(28,28)))
}

  1. Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?
pca_pixel_8<-mt_pixel%>%
  filter(label=='8')%>%
  dplyr::select(-label)
mypca_8<-prcomp(pca_pixel_8)
mypca_var_8<-cumsum((mypca_8$sdev)^2)/sum((mypca_8$sdev)^2)
plot(mypca_var_8)

which.max(mypca_var_8>=1.0)
## [1] 537
for(j in 1:10){
  plot(4,4, xlim=c(0,28), ylim=c(0,28))
  imageShow(array(mypca_8$rotation[,j],c(28,28)))
}

It only requires 537 to achieve 100% variance. The figure 8 in horizontal can be seen.

  1. An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)
labels = mnist_train[,1]

test_size <- floor(.2 * nrow(mnist_train))
set.seed(9999)
test_index <- sample(seq_len(nrow(mt_pixel)), size = test_size)

train_df <- mt_pixel[-test_index,]
train_labels <- c(labels[-test_index,])[[1]]

test_df <- mt_pixel[test_index,]
test_labels <- c(labels[test_index,])[[1]]

mm<-multinom(train_labels~., data=train_df, MaxNWts=8000)
## # weights:  7870 (7074 variable)
## initial  value 77366.859125 
## iter  10 value 13777.297553
## iter  20 value 8741.917348
## iter  30 value 7812.464535
## iter  40 value 7156.458278
## iter  50 value 6334.186620
## iter  60 value 5445.048983
## iter  70 value 4559.672206
## iter  80 value 4123.623320
## iter  90 value 3812.516435
## iter 100 value 3569.730302
## final  value 3569.730302 
## stopped after 100 iterations
predictions <- predict(mm, test_df)
test_labels <- as.factor(test_labels)

confusionMatrix(predictions,test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8   9
##          0 812   0   0   0   0   0   0   0   0   0
##          1   3 863  15   3   0   0   0   0   0   0
##          2   1   1 783  28  10   0   0   0   0   0
##          3   0   1  24 803   7  12   0   0   0   0
##          4   0   0  21   4 832  11   2   1   0   0
##          5   0   0   0  10   5 668  17   3  14   1
##          6   0   0   8   2  15  12 779   0   7   0
##          7   0   0   2   2   3   8   2 866   5  13
##          8   0   0   0   7   1  22   3  11 784   9
##          9   0   0   0   0   1   4   2  17  25 835
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9554          
##                  95% CI : (0.9507, 0.9597)
##     No Information Rate : 0.1069          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9504          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.99510   0.9977  0.91794   0.9348  0.95195  0.90638
## Specificity           1.00000   0.9972  0.99470   0.9942  0.99482  0.99348
## Pos Pred Value        1.00000   0.9762  0.95140   0.9481  0.95522  0.93036
## Neg Pred Value        0.99947   0.9997  0.99076   0.9926  0.99442  0.99102
## Prevalence            0.09714   0.1030  0.10155   0.1023  0.10405  0.08774
## Detection Rate        0.09667   0.1027  0.09321   0.0956  0.09905  0.07952
## Detection Prevalence  0.09667   0.1052  0.09798   0.1008  0.10369  0.08548
## Balanced Accuracy     0.99755   0.9975  0.95632   0.9645  0.97338  0.94993
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.96770   0.9644  0.93892   0.9732
## Specificity           0.99421   0.9953  0.99299   0.9935
## Pos Pred Value        0.94654   0.9612  0.93668   0.9446
## Neg Pred Value        0.99657   0.9957  0.99326   0.9969
## Prevalence            0.09583   0.1069  0.09940   0.1021
## Detection Rate        0.09274   0.1031  0.09333   0.0994
## Detection Prevalence  0.09798   0.1073  0.09964   0.1052
## Balanced Accuracy     0.98095   0.9798  0.96596   0.9833

3. House Prices

You are to compete in the House Prices: Advanced Regression Techniques competition https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points

housing<-read_csv("https://raw.githubusercontent.com/ferrysany/CUNY-Data621/main/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(housing)
## [1] 1460   81
#Select LotArea, OverallCond and Yearbuilt as the independent variables for univariate descriptive statistics and SalePrice  as the dependent variable.
housing_select<-housing%>%
  dplyr::select('LotArea', 'YearBuilt', 'OverallCond', 'SalePrice')

summary(housing_select)
##     LotArea         YearBuilt     OverallCond      SalePrice     
##  Min.   :  1300   Min.   :1872   Min.   :1.000   Min.   : 34900  
##  1st Qu.:  7554   1st Qu.:1954   1st Qu.:5.000   1st Qu.:129975  
##  Median :  9478   Median :1973   Median :5.000   Median :163000  
##  Mean   : 10517   Mean   :1971   Mean   :5.575   Mean   :180921  
##  3rd Qu.: 11602   3rd Qu.:2000   3rd Qu.:6.000   3rd Qu.:214000  
##  Max.   :215245   Max.   :2010   Max.   :9.000   Max.   :755000
housing_select_cor<-cor(housing_select)
housing_select_cor
##                 LotArea   YearBuilt OverallCond   SalePrice
## LotArea      1.00000000  0.01422765 -0.00563627  0.26384335
## YearBuilt    0.01422765  1.00000000 -0.37598320  0.52289733
## OverallCond -0.00563627 -0.37598320  1.00000000 -0.07785589
## SalePrice    0.26384335  0.52289733 -0.07785589  1.00000000
corrplot(housing_select_cor, method='color', type='lower', sig.level=0.01)

y<-housing_select$SalePrice

housing_plot<-function(x, y){
  d<-as.data.frame(cbind(x,y))
  ggplot(d, aes(x=x, y=y))+
    geom_point()+
    geom_smooth(method=lm)
    #geom_text_repel(aes(label=ifelse(.data[[i]]<=cutoff, Name, "")))
}

for(i in 1:(ncol(housing_select)-1)){
  x<-housing_select[[i]]
  cor_result<-cor.test(x, y, method='pearson', conf.level=0.8)
  print(housing_plot(x,y))
  print(paste('Hypothesis test for the correlations', colnames(housing_select)[i], 'and SalePrice'))
  print(cor_result)
}
## `geom_smooth()` using formula = 'y ~ x'

## [1] "Hypothesis test for the correlations LotArea and SalePrice"
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434
## `geom_smooth()` using formula = 'y ~ x'

## [1] "Hypothesis test for the correlations YearBuilt and SalePrice"
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4980766 0.5468619
## sample estimates:
##       cor 
## 0.5228973
## `geom_smooth()` using formula = 'y ~ x'

## [1] "Hypothesis test for the correlations OverallCond and SalePrice"
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.1111272 -0.0444103
## sample estimates:
##         cor 
## -0.07785589

LotArea - the hypothesis that the the correlations is zero can be rejected Yearbuild - the hypothesis that the the correlations is zero can be rejected OverallCond - the hypothesis that the the correlations is zero can be rejected

The concept of a familywise error rate as the probability of making a Type I error among a specified group. It is not likely to have type I error because the P values are relatively low and the 80% intervals don’t include zero.

Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points

p<-solve(housing_select_cor)
print('The precision matrix is')
## [1] "The precision matrix is"
p
##                 LotArea  YearBuilt OverallCond  SalePrice
## LotArea      1.10230050  0.2132917  0.05541643 -0.3980498
## YearBuilt    0.21329169  1.6713025  0.56056051 -0.8865523
## OverallCond  0.05541643  0.5605605  1.19435417 -0.2147493
## SalePrice   -0.39804982 -0.8865523 -0.21474934  1.5518791
pcor<-p%*%housing_select_cor
pcor
##                   LotArea     YearBuilt   OverallCond     SalePrice
## LotArea      1.000000e+00 -2.775558e-17 -6.938894e-18  0.000000e+00
## YearBuilt   -8.326673e-17  1.000000e+00  1.110223e-16 -1.110223e-16
## OverallCond -2.081668e-17  0.000000e+00  1.000000e+00  0.000000e+00
## SalePrice    1.110223e-16  0.000000e+00  4.163336e-17  1.000000e+00
corp<-housing_select_cor%*%p
corp
##                   LotArea    YearBuilt  OverallCond     SalePrice
## LotArea      1.000000e+00 0.000000e+00 6.938894e-18  0.000000e+00
## YearBuilt   -2.775558e-17 1.000000e+00 2.775558e-17 -1.110223e-16
## OverallCond -3.469447e-18 9.714451e-17 1.000000e+00 -1.387779e-17
## SalePrice    5.551115e-17 0.000000e+00 5.551115e-17  1.000000e+00
#LU decomposition for correlation matrix
lu_hc<-lu(housing_select_cor)
expand(lu_hc)
## $L
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]        [,2]        [,3]        [,4]       
## [1,]  1.00000000           .           .           .
## [2,]  0.01422765  1.00000000           .           .
## [3,] -0.00563627 -0.37597911  1.00000000           .
## [4,]  0.26384335  0.51924857  0.13838020  1.00000000
## 
## $U
## 4 x 4 Matrix of class "dtrMatrix"
##      [,1]        [,2]        [,3]        [,4]       
## [1,]  1.00000000  0.01422765 -0.00563627  0.26384335
## [2,]           .  0.99979757 -0.37590300  0.51914346
## [3,]           .           .  0.85863655  0.11881830
## [4,]           .           .           .  0.64438008
## 
## $P
## 4 x 4 sparse Matrix of class "pMatrix"
##             
## [1,] | . . .
## [2,] . | . .
## [3,] . . | .
## [4,] . . . |

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.

ggplot(housing_select, aes(SalePrice))+
  geom_histogram()

Select SalePrice as the right skewed data.

exp_fit<-fitdistr(housing_select$SalePrice, 'exponential')

set.seed(9898)
exp_dist<-rexp(1000, exp_fit$estimate)
hist(exp_dist)

It looks more like an exponential distribution.

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points

#5th and 95th percentiles using the cumulative distribution function (CDF)
qexp(c(0.05,0.95), rate=exp_fit$estimate)
## [1]   9280.044 541991.465
#95% confidence interval from the empirical data
qnorm(c(0.025, 0.975), mean = mean(housing_select$SalePrice), sd = sd(housing_select$SalePrice))
## [1]  25216.75 336625.64
#Empirical 5th percentile and 95th percentile of the data
quantile(housing_select$SalePrice, c(0.05, 0.95))
##     5%    95% 
##  88000 326100

The result from CDF is very different from the actual quantile of the empirical data. The exponential model is incorrect. The normal distribution is closer to the actual at 97.5% but it is much smaller at the other end. The empirical data is not a normal distribution function. Other transformation may require to obtain a better model.

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 point

I did similar analysis in Data 621 class and borrow the information from my previous work。Several models were tested and only the final one is shown below:

Simplify version

housing_data <- housing

summary(housing_data)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
#Correlation Plot
housing_datadd<-housing_data%>%
  dplyr::select(where(is.numeric))
corrplot(cor(housing_datadd), method='color', type='lower', sig.level=0.01)

#rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")

hd_lmod6<-housing_data%>%
  rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")

lmod6_loga<-lm(log(SalePrice) ~ MSZoning + LotArea + Street + LotConfig + 
    LandSlope + Neighborhood + Condition1 + Condition2 + BldgType + 
    OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle + 
    RoofMatl + Exterior1st + MasVnrType + BsmtQual + BsmtExposure + 
    BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + FrtflrSF + SecFlrSF + 
    KitchenAbvGr + KitchenQual + Functional + Fireplaces + GarageCars + 
    GarageArea + GarageQual + GarageCond + WoodDeckSF + ScreenPorch + 
    PoolArea + SaleType, data = hd_lmod6)

summary(lmod6_loga)
## 
## Call:
## lm(formula = log(SalePrice) ~ MSZoning + LotArea + Street + LotConfig + 
##     LandSlope + Neighborhood + Condition1 + Condition2 + BldgType + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle + 
##     RoofMatl + Exterior1st + MasVnrType + BsmtQual + BsmtExposure + 
##     BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + FrtflrSF + SecFlrSF + 
##     KitchenAbvGr + KitchenQual + Functional + Fireplaces + GarageCars + 
##     GarageArea + GarageQual + GarageCond + WoodDeckSF + ScreenPorch + 
##     PoolArea + SaleType, data = hd_lmod6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74496 -0.04599  0.00366  0.05176  0.74496 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.169e+00  7.393e-01   1.581 0.114180    
## MSZoningFV           4.974e-01  5.448e-02   9.131  < 2e-16 ***
## MSZoningRH           4.093e-01  5.677e-02   7.210 9.81e-13 ***
## MSZoningRL           4.587e-01  4.698e-02   9.762  < 2e-16 ***
## MSZoningRM           4.002e-01  4.394e-02   9.108  < 2e-16 ***
## LotArea              3.392e-06  4.289e-07   7.908 5.85e-15 ***
## StreetPave           1.540e-01  5.488e-02   2.806 0.005094 ** 
## LotConfigCulDSac     1.751e-02  1.372e-02   1.276 0.202043    
## LotConfigFR2        -3.318e-02  1.781e-02  -1.863 0.062723 .  
## LotConfigFR3        -3.645e-02  5.522e-02  -0.660 0.509232    
## LotConfigInside     -7.968e-03  7.723e-03  -1.032 0.302451    
## LandSlopeMod         2.239e-02  1.554e-02   1.441 0.149915    
## LandSlopeSev        -2.526e-01  4.753e-02  -5.315 1.27e-07 ***
## NeighborhoodBlueste -4.067e-02  8.146e-02  -0.499 0.617725    
## NeighborhoodBrDale  -4.722e-02  4.678e-02  -1.009 0.313024    
## NeighborhoodBrkSide  5.804e-02  4.110e-02   1.412 0.158183    
## NeighborhoodClearCr  2.553e-02  3.954e-02   0.646 0.518621    
## NeighborhoodCollgCr -6.924e-03  3.043e-02  -0.228 0.820065    
## NeighborhoodCrawfor  1.186e-01  3.604e-02   3.292 0.001024 ** 
## NeighborhoodEdwards -7.442e-02  3.412e-02  -2.181 0.029379 *  
## NeighborhoodGilbert -3.470e-03  3.220e-02  -0.108 0.914191    
## NeighborhoodIDOTRR   2.643e-02  4.730e-02   0.559 0.576350    
## NeighborhoodMeadowV -1.784e-01  4.914e-02  -3.630 0.000295 ***
## NeighborhoodMitchel -3.044e-02  3.479e-02  -0.875 0.381875    
## NeighborhoodNAmes   -2.945e-02  3.266e-02  -0.902 0.367339    
## NeighborhoodNoRidge  4.140e-02  3.442e-02   1.203 0.229287    
## NeighborhoodNPkVill  1.108e-02  4.674e-02   0.237 0.812643    
## NeighborhoodNridgHt  7.771e-02  3.104e-02   2.503 0.012437 *  
## NeighborhoodNWAmes  -4.260e-02  3.361e-02  -1.268 0.205178    
## NeighborhoodOldTown  9.060e-03  4.180e-02   0.217 0.828432    
## NeighborhoodSawyer  -1.312e-02  3.454e-02  -0.380 0.704128    
## NeighborhoodSawyerW  1.199e-03  3.333e-02   0.036 0.971311    
## NeighborhoodSomerst  2.126e-02  3.838e-02   0.554 0.579772    
## NeighborhoodStoneBr  1.172e-01  3.471e-02   3.376 0.000760 ***
## NeighborhoodSWISU    3.097e-03  4.152e-02   0.075 0.940541    
## NeighborhoodTimber  -9.370e-04  3.408e-02  -0.027 0.978069    
## NeighborhoodVeenker  4.853e-02  4.484e-02   1.082 0.279328    
## Condition1Feedr      6.623e-02  2.340e-02   2.830 0.004730 ** 
## Condition1Norm       1.015e-01  1.888e-02   5.375 9.17e-08 ***
## Condition1PosA       8.257e-02  4.320e-02   1.912 0.056163 .  
## Condition1PosN       1.030e-01  3.219e-02   3.201 0.001407 ** 
## Condition1RRAe      -2.050e-02  3.963e-02  -0.517 0.605005    
## Condition1RRAn       8.074e-02  3.021e-02   2.673 0.007628 ** 
## Condition1RRNe       7.943e-02  7.664e-02   1.036 0.300229    
## Condition1RRNn       1.444e-01  5.482e-02   2.633 0.008562 ** 
## Condition2Feedr     -1.012e-01  1.008e-01  -1.004 0.315495    
## Condition2Norm      -1.107e-02  8.507e-02  -0.130 0.896475    
## Condition2PosA       2.698e-01  1.365e-01   1.976 0.048343 *  
## Condition2PosN      -9.426e-01  1.180e-01  -7.990 3.12e-15 ***
## Condition2RRAe      -4.929e-01  1.896e-01  -2.600 0.009433 ** 
## Condition2RRAn      -6.588e-02  1.352e-01  -0.487 0.626123    
## Condition2RRNn      -1.172e-01  1.148e-01  -1.021 0.307574    
## BldgType2fmCon      -4.841e-02  2.855e-02  -1.696 0.090206 .  
## BldgTypeDuplex      -7.556e-02  3.243e-02  -2.330 0.019994 *  
## BldgTypeTwnhs       -1.013e-01  2.371e-02  -4.270 2.11e-05 ***
## BldgTypeTwnhsE      -4.201e-02  1.554e-02  -2.703 0.006960 ** 
## OverallQual          4.587e-02  4.430e-03  10.354  < 2e-16 ***
## OverallCond          4.048e-02  3.672e-03  11.023  < 2e-16 ***
## YearBuilt            2.398e-03  3.024e-04   7.930 4.96e-15 ***
## YearRemodAdd         7.711e-04  2.477e-04   3.113 0.001898 ** 
## RoofStyleGable      -2.116e-02  7.849e-02  -0.270 0.787562    
## RoofStyleGambrel    -3.232e-02  8.556e-02  -0.378 0.705740    
## RoofStyleHip        -1.982e-02  7.875e-02  -0.252 0.801313    
## RoofStyleMansard     6.594e-02  9.201e-02   0.717 0.473735    
## RoofStyleShed        4.520e-01  1.480e-01   3.054 0.002311 ** 
## RoofMatlCompShg      2.771e+00  1.315e-01  21.071  < 2e-16 ***
## RoofMatlMembran      3.179e+00  1.961e-01  16.208  < 2e-16 ***
## RoofMatlMetal        3.085e+00  1.927e-01  16.007  < 2e-16 ***
## RoofMatlRoll         2.947e+00  1.712e-01  17.219  < 2e-16 ***
## RoofMatlTar&Grv      2.796e+00  1.533e-01  18.243  < 2e-16 ***
## RoofMatlWdShake      2.686e+00  1.471e-01  18.253  < 2e-16 ***
## RoofMatlWdShngl      2.787e+00  1.370e-01  20.345  < 2e-16 ***
## Exterior1stBrkComm  -4.301e-01  1.140e-01  -3.773 0.000169 ***
## Exterior1stBrkFace   1.135e-01  3.403e-02   3.336 0.000875 ***
## Exterior1stCBlock    3.157e-02  1.091e-01   0.289 0.772283    
## Exterior1stCemntBd   8.221e-02  3.535e-02   2.326 0.020188 *  
## Exterior1stHdBoard   2.375e-02  3.093e-02   0.768 0.442775    
## Exterior1stImStucc   7.778e-02  1.083e-01   0.718 0.472748    
## Exterior1stMetalSd   6.431e-02  3.052e-02   2.107 0.035326 *  
## Exterior1stPlywood   2.042e-02  3.243e-02   0.630 0.529058    
## Exterior1stStone     2.418e-02  8.690e-02   0.278 0.780901    
## Exterior1stStucco    5.700e-02  3.800e-02   1.500 0.133794    
## Exterior1stVinylSd   6.237e-02  3.069e-02   2.032 0.042342 *  
## Exterior1stWd Sdng   3.165e-02  3.033e-02   1.043 0.296981    
## Exterior1stWdShing   2.560e-02  3.806e-02   0.673 0.501286    
## MasVnrTypeBrkFace    6.833e-02  2.920e-02   2.340 0.019429 *  
## MasVnrTypeNone       5.453e-02  2.875e-02   1.897 0.058106 .  
## MasVnrTypeStone      7.245e-02  3.090e-02   2.345 0.019205 *  
## BsmtQualFa          -4.815e-02  2.719e-02  -1.771 0.076867 .  
## BsmtQualGd          -3.690e-02  1.392e-02  -2.652 0.008117 ** 
## BsmtQualTA          -5.091e-02  1.728e-02  -2.945 0.003290 ** 
## BsmtExposureGd       3.189e-02  1.309e-02   2.436 0.014978 *  
## BsmtExposureMn       2.685e-03  1.286e-02   0.209 0.834610    
## BsmtExposureNo      -1.215e-02  8.889e-03  -1.366 0.172110    
## BsmtFinSF1           1.628e-04  2.003e-05   8.125 1.10e-15 ***
## BsmtFinSF2           1.316e-04  2.568e-05   5.124 3.49e-07 ***
## BsmtUnfSF            8.232e-05  1.973e-05   4.173 3.22e-05 ***
## FrtflrSF             2.651e-04  2.098e-05  12.634  < 2e-16 ***
## SecFlrSF             2.619e-04  9.985e-06  26.229  < 2e-16 ***
## KitchenAbvGr        -5.089e-02  2.871e-02  -1.772 0.076586 .  
## KitchenQualFa       -6.974e-02  2.846e-02  -2.450 0.014408 *  
## KitchenQualGd       -5.269e-02  1.439e-02  -3.661 0.000262 ***
## KitchenQualTA       -5.496e-02  1.653e-02  -3.324 0.000914 ***
## FunctionalMaj2      -1.376e-01  6.575e-02  -2.093 0.036539 *  
## FunctionalMin1       2.448e-02  4.017e-02   0.609 0.542420    
## FunctionalMin2       2.954e-02  3.987e-02   0.741 0.458885    
## FunctionalMod       -5.039e-02  4.984e-02  -1.011 0.312209    
## FunctionalSev       -3.792e-01  1.197e-01  -3.168 0.001574 ** 
## FunctionalTyp        6.907e-02  3.504e-02   1.971 0.048907 *  
## Fireplaces           2.566e-02  5.810e-03   4.416 1.09e-05 ***
## GarageCars           2.516e-02  9.846e-03   2.556 0.010721 *  
## GarageArea           1.046e-04  3.239e-05   3.229 0.001276 ** 
## GarageQualFa        -4.389e-01  1.199e-01  -3.662 0.000261 ***
## GarageQualGd        -3.780e-01  1.230e-01  -3.074 0.002159 ** 
## GarageQualPo        -5.213e-01  1.474e-01  -3.536 0.000422 ***
## GarageQualTA        -4.065e-01  1.187e-01  -3.425 0.000635 ***
## GarageCondFa         3.108e-01  1.415e-01   2.196 0.028255 *  
## GarageCondGd         3.734e-01  1.447e-01   2.581 0.009982 ** 
## GarageCondPo         4.240e-01  1.520e-01   2.790 0.005361 ** 
## GarageCondTA         3.711e-01  1.398e-01   2.654 0.008066 ** 
## WoodDeckSF           7.449e-05  2.549e-05   2.922 0.003541 ** 
## ScreenPorch          1.900e-04  5.262e-05   3.610 0.000319 ***
## PoolArea             1.330e-04  7.536e-05   1.765 0.077802 .  
## SaleTypeCon          1.074e-01  7.710e-02   1.393 0.163740    
## SaleTypeConLD        1.444e-01  4.858e-02   2.973 0.003011 ** 
## SaleTypeConLI       -5.701e-02  5.564e-02  -1.025 0.305766    
## SaleTypeConLw        2.232e-02  5.537e-02   0.403 0.686959    
## SaleTypeCWD          4.836e-02  5.529e-02   0.875 0.381927    
## SaleTypeNew          4.994e-02  2.121e-02   2.355 0.018692 *  
## SaleTypeOth          7.890e-02  1.036e-01   0.761 0.446644    
## SaleTypeWD           6.764e-03  1.734e-02   0.390 0.696500    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1014 on 1209 degrees of freedom
##   (因为不存在,120个观察量被删除了)
## Multiple R-squared:  0.9346, Adjusted R-squared:  0.9276 
## F-statistic: 132.9 on 130 and 1209 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lmod6_loga)

sqrt(sum(lmod6_loga$residuals^2) / lmod6_loga$df)
## [1] 0.101422
rmse(hd_lmod6$SalePrice, exp(lmod6_loga$fitted.values))
## [1] 111218.1

#r Predict

test_housing<-read_csv("https://raw.githubusercontent.com/ferrysany/CUNY-Data621/main/test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(test_housing)
## [1] 1459   80
test_housing_data <- test_housing %>%
  rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")

# test_housing_data1<-test_housing_data1%>%
#   dplyr::select(MSZoning, LotArea, Street, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle,  RoofMatl, Exterior1st, MasVnrType, BsmtQual, BsmtExposure, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, FrtflrSF, SecFlrSF, KitchenAbvGr, KitchenQual,Functional,Fireplaces, GarageCars, GarageArea, GarageQual, GarageCond, WoodDeckSF, ScreenPorch, PoolArea, SaleType) 

test_housing_data1<-test_housing_data
test_housing_data1$Exterior1st[which(!(test_housing_data1$Exterior1st %in% unique(hd_lmod6$Exterior1st)))]<-NA

#There is an error when I run the code below, I can't figure out how to remove the extra level in one of the variables.
#response_predict <- predict(lmod6_log, test_housing_data1, type='response')
#response_predict <- predict(lmod6_loga, test_housing_data1)

#res <- data.frame(Id = test_housing$Id, SalePrice = exp(response_predict))
#write.csv(res, file = "price_step.csv", row.names = FALSE)