library(tidyverse)
library(cubature)
library(expm)
library(igraph)
library(readr)
library(Rfast)
library(nnet)
library(corrplot)
library(ggrepel)
library(Matrix)
library(MASS)
library(Metrics)
library(OpenImageR)
library(caret)
Form the A matrix. Then, introduce decay and form the B matrix as we did in the course notes.
A<-matrix(c(0,0,1/3,0,0,0,
1/2,0,1/3,0,0,0,
1/2,0,0,0,0,0,
0,0,0,0,1/2,1,
0,0,1/3,1/2,0,0,
0,0,0,1/2,1/2,0),6)
A
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0000000 0.5000000 0.5 0.0 0.0000000 0.0
## [2,] 0.0000000 0.0000000 0.0 0.0 0.0000000 0.0
## [3,] 0.3333333 0.3333333 0.0 0.0 0.3333333 0.0
## [4,] 0.0000000 0.0000000 0.0 0.0 0.5000000 0.5
## [5,] 0.0000000 0.0000000 0.0 0.5 0.0000000 0.5
## [6,] 0.0000000 0.0000000 0.0 1.0 0.0000000 0.0
B<-0.85*A+(0.15/6)
B
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0250000 0.4500000 0.450 0.025 0.0250000 0.025
## [2,] 0.0250000 0.0250000 0.025 0.025 0.0250000 0.025
## [3,] 0.3083333 0.3083333 0.025 0.025 0.3083333 0.025
## [4,] 0.0250000 0.0250000 0.025 0.025 0.4500000 0.450
## [5,] 0.0250000 0.0250000 0.025 0.450 0.0250000 0.450
## [6,] 0.0250000 0.0250000 0.025 0.875 0.0250000 0.025
Start with a uniform rank vector r and perform power iterations on B till convergence. That is, compute the solution r = Bn × r. Attempt this for a sufficiently large n so that r actually converges. (5 Points)
r_i<-c(1/6,1/6,1/6,1/6,1/6,1/6)
r_i
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
r_f<-(B%^%5)%*%r_i
r_f
## [,1]
## [1,] 0.06055435
## [2,] 0.01558875
## [3,] 0.08188545
## [4,] 0.14329837
## [5,] 0.14329837
## [6,] 0.14329837
powerIteration <- function(A, r, n) {
Iter = diag(dim(A)[1])
for (i in 1:n)
{
Iter = Iter %*% A
}
return (Iter %*% r)
}
powerIteration(A, r_i, 5)
## [,1]
## [1,] 0.03703704
## [2,] 0.00000000
## [3,] 0.06944444
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(A, r_i, 15)
## [,1]
## [1,] 0.03333381
## [2,] 0.00000000
## [3,] 0.06666702
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(A, r_i, 25)
## [,1]
## [1,] 0.03333333
## [2,] 0.00000000
## [3,] 0.06666667
## [4,] 0.16666667
## [5,] 0.16666667
## [6,] 0.16666667
powerIteration(B, r_i, 5)
## [,1]
## [1,] 0.06055435
## [2,] 0.01558875
## [3,] 0.08188545
## [4,] 0.14329837
## [5,] 0.14329837
## [6,] 0.14329837
powerIteration(B, r_i, 15)
## [,1]
## [1,] 0.03524057
## [2,] 0.00934132
## [3,] 0.04865137
## [4,] 0.08745120
## [5,] 0.08745120
## [6,] 0.08745120
powerIteration(B, r_i, 125)
## [,1]
## [1,] 0.0001512418
## [2,] 0.0000400904
## [3,] 0.0002087980
## [4,] 0.0003753185
## [5,] 0.0003753185
## [6,] 0.0003753185
The rank are in the descending order of 4,5,6,3,1,2. 4, 5 and 6 are in the same rank. The order is the same even n=100, as such it’s convergence, but the numerical number goes to zero.
Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1.(10 points)
eigen(B)
## eigen() decomposition
## $values
## [1] 0.95165271+0i -0.42500000+0i -0.42500000-0i 0.41436582+0i -0.34733611+0i
## [6] -0.01868242+0i
##
## $vectors
## [,1] [,2] [,3]
## [1,] -0.2159123+0i -5.345225e-01-0.000000e+00i -5.345225e-01+0.000000e+00i
## [2,] -0.0572329+0i -5.200810e-17+3.664536e-10i -5.200810e-17-3.664536e-10i
## [3,] -0.2980792+0i 5.345225e-01+0.000000e+00i 5.345225e-01+0.000000e+00i
## [4,] -0.5358032+0i -2.672612e-01+0.000000e+00i -2.672612e-01-0.000000e+00i
## [5,] -0.5358032+0i -2.672612e-01+0.000000e+00i -2.672612e-01-0.000000e+00i
## [6,] -0.5358032+0i 5.345225e-01-0.000000e+00i 5.345225e-01+0.000000e+00i
## [,4] [,5] [,6]
## [1,] -0.77869913+0i -0.775079691+0i 0.55060201+0i
## [2,] -0.07537631+0i 0.009090375+0i -0.61511270+0i
## [3,] -0.61034825+0i 0.631781588+0i 0.56386946+0i
## [4,] 0.07169632+0i 0.002637034+0i -0.01322899+0i
## [5,] 0.07169632+0i 0.002637034+0i -0.01322899+0i
## [6,] 0.07169632+0i 0.002637034+0i -0.01322899+0i
unit_Vector<-function(v){
len<-sqrt(sum(v^2))
return (v/len)
}
graphObj <- graph.adjacency(A, weighted = TRUE, mode = "directed")
plot(graphObj)
#Compute page rank using the igraph
prVec <- page.rank(graphObj)$vector
#Compute the unit vector to compare with earlier results
matrix(unit_Vector(prVec), nrow=6, byrow=TRUE)
## [,1]
## [1,] 0.1044385
## [2,] 0.1488249
## [3,] 0.1159674
## [4,] 0.7043472
## [5,] 0.4037861
## [6,] 0.5425377
temp<-tempfile()
download.file("https://github.com/ferrysany/data605/raw/main/train.csv.zip", temp)
mnist_train<-read_csv(unzip(temp, "train.csv"))
dim(mnist_train)
## [1] 42000 785
mt_pixel<-mnist_train[,2:785]/255
mt_pixel$label<-mnist_train$label
for (i in 1:10){
digi<- matrix(as.numeric(mt_pixel[i,1:(ncol(mt_pixel)-1)]),nrow=28)
digi<-digi[,ncol(digi):1]
image(digi, col=grey.colors(255), axes=FALSE, xlim=c(0,1), ylim=c(-0.2,1.2))
}
mnist_train[1:10,1]
ggplot(mt_pixel, aes(x=label))+
geom_histogram()+
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))+
labs(title = 'Frequency Distribution of Numbers', y='Frequency', x='Number')
pix_intensity<-mt_pixel %>%
mutate(psum=rowSums(across(starts_with('pixel'))), na.rm=T)%>%
dplyr::select(label, psum)%>%
group_by(label)%>%
summarize(mean_pixel=mean(psum))
pix_intensity
The data shows that people draw curves with high intensity.
6.Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why?
pca_pixel<-mt_pixel%>%
dplyr::select(-label)
mypca<-prcomp(pca_pixel)
mypca_var<-cumsum((mypca$sdev)^2)/sum((mypca$sdev)^2)
plot(mypca_var)
It
looks like 95% is close to 180 index
which.max(mypca_var>=.95)
## [1] 154
which.max(mypca_var>=1.0)
## [1] 704
The actual 95% is at 154.
for(j in 1:10){
plot(4,4, xlim=c(0,28), ylim=c(0,28))
imageShow(array(mypca$rotation[,j],c(28,28)))
}
pca_pixel_8<-mt_pixel%>%
filter(label=='8')%>%
dplyr::select(-label)
mypca_8<-prcomp(pca_pixel_8)
mypca_var_8<-cumsum((mypca_8$sdev)^2)/sum((mypca_8$sdev)^2)
plot(mypca_var_8)
which.max(mypca_var_8>=1.0)
## [1] 537
for(j in 1:10){
plot(4,4, xlim=c(0,28), ylim=c(0,28))
imageShow(array(mypca_8$rotation[,j],c(28,28)))
}
It only requires 537 to achieve 100% variance. The figure 8 in
horizontal can be seen.
labels = mnist_train[,1]
test_size <- floor(.2 * nrow(mnist_train))
set.seed(9999)
test_index <- sample(seq_len(nrow(mt_pixel)), size = test_size)
train_df <- mt_pixel[-test_index,]
train_labels <- c(labels[-test_index,])[[1]]
test_df <- mt_pixel[test_index,]
test_labels <- c(labels[test_index,])[[1]]
mm<-multinom(train_labels~., data=train_df, MaxNWts=8000)
## # weights: 7870 (7074 variable)
## initial value 77366.859125
## iter 10 value 13777.297553
## iter 20 value 8741.917348
## iter 30 value 7812.464535
## iter 40 value 7156.458278
## iter 50 value 6334.186620
## iter 60 value 5445.048983
## iter 70 value 4559.672206
## iter 80 value 4123.623320
## iter 90 value 3812.516435
## iter 100 value 3569.730302
## final value 3569.730302
## stopped after 100 iterations
predictions <- predict(mm, test_df)
test_labels <- as.factor(test_labels)
confusionMatrix(predictions,test_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 812 0 0 0 0 0 0 0 0 0
## 1 3 863 15 3 0 0 0 0 0 0
## 2 1 1 783 28 10 0 0 0 0 0
## 3 0 1 24 803 7 12 0 0 0 0
## 4 0 0 21 4 832 11 2 1 0 0
## 5 0 0 0 10 5 668 17 3 14 1
## 6 0 0 8 2 15 12 779 0 7 0
## 7 0 0 2 2 3 8 2 866 5 13
## 8 0 0 0 7 1 22 3 11 784 9
## 9 0 0 0 0 1 4 2 17 25 835
##
## Overall Statistics
##
## Accuracy : 0.9554
## 95% CI : (0.9507, 0.9597)
## No Information Rate : 0.1069
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9504
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.99510 0.9977 0.91794 0.9348 0.95195 0.90638
## Specificity 1.00000 0.9972 0.99470 0.9942 0.99482 0.99348
## Pos Pred Value 1.00000 0.9762 0.95140 0.9481 0.95522 0.93036
## Neg Pred Value 0.99947 0.9997 0.99076 0.9926 0.99442 0.99102
## Prevalence 0.09714 0.1030 0.10155 0.1023 0.10405 0.08774
## Detection Rate 0.09667 0.1027 0.09321 0.0956 0.09905 0.07952
## Detection Prevalence 0.09667 0.1052 0.09798 0.1008 0.10369 0.08548
## Balanced Accuracy 0.99755 0.9975 0.95632 0.9645 0.97338 0.94993
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.96770 0.9644 0.93892 0.9732
## Specificity 0.99421 0.9953 0.99299 0.9935
## Pos Pred Value 0.94654 0.9612 0.93668 0.9446
## Neg Pred Value 0.99657 0.9957 0.99326 0.9969
## Prevalence 0.09583 0.1069 0.09940 0.1021
## Detection Rate 0.09274 0.1031 0.09333 0.0994
## Detection Prevalence 0.09798 0.1073 0.09964 0.1052
## Balanced Accuracy 0.98095 0.9798 0.96596 0.9833
You are to compete in the House Prices: Advanced Regression Techniques competition https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
housing<-read_csv("https://raw.githubusercontent.com/ferrysany/CUNY-Data621/main/train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(housing)
## [1] 1460 81
#Select LotArea, OverallCond and Yearbuilt as the independent variables for univariate descriptive statistics and SalePrice as the dependent variable.
housing_select<-housing%>%
dplyr::select('LotArea', 'YearBuilt', 'OverallCond', 'SalePrice')
summary(housing_select)
## LotArea YearBuilt OverallCond SalePrice
## Min. : 1300 Min. :1872 Min. :1.000 Min. : 34900
## 1st Qu.: 7554 1st Qu.:1954 1st Qu.:5.000 1st Qu.:129975
## Median : 9478 Median :1973 Median :5.000 Median :163000
## Mean : 10517 Mean :1971 Mean :5.575 Mean :180921
## 3rd Qu.: 11602 3rd Qu.:2000 3rd Qu.:6.000 3rd Qu.:214000
## Max. :215245 Max. :2010 Max. :9.000 Max. :755000
housing_select_cor<-cor(housing_select)
housing_select_cor
## LotArea YearBuilt OverallCond SalePrice
## LotArea 1.00000000 0.01422765 -0.00563627 0.26384335
## YearBuilt 0.01422765 1.00000000 -0.37598320 0.52289733
## OverallCond -0.00563627 -0.37598320 1.00000000 -0.07785589
## SalePrice 0.26384335 0.52289733 -0.07785589 1.00000000
corrplot(housing_select_cor, method='color', type='lower', sig.level=0.01)
y<-housing_select$SalePrice
housing_plot<-function(x, y){
d<-as.data.frame(cbind(x,y))
ggplot(d, aes(x=x, y=y))+
geom_point()+
geom_smooth(method=lm)
#geom_text_repel(aes(label=ifelse(.data[[i]]<=cutoff, Name, "")))
}
for(i in 1:(ncol(housing_select)-1)){
x<-housing_select[[i]]
cor_result<-cor.test(x, y, method='pearson', conf.level=0.8)
print(housing_plot(x,y))
print(paste('Hypothesis test for the correlations', colnames(housing_select)[i], 'and SalePrice'))
print(cor_result)
}
## `geom_smooth()` using formula = 'y ~ x'
## [1] "Hypothesis test for the correlations LotArea and SalePrice"
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
## `geom_smooth()` using formula = 'y ~ x'
## [1] "Hypothesis test for the correlations YearBuilt and SalePrice"
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4980766 0.5468619
## sample estimates:
## cor
## 0.5228973
## `geom_smooth()` using formula = 'y ~ x'
## [1] "Hypothesis test for the correlations OverallCond and SalePrice"
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.1111272 -0.0444103
## sample estimates:
## cor
## -0.07785589
LotArea - the hypothesis that the the correlations is zero can be rejected Yearbuild - the hypothesis that the the correlations is zero can be rejected OverallCond - the hypothesis that the the correlations is zero can be rejected
The concept of a familywise error rate as the probability of making a Type I error among a specified group. It is not likely to have type I error because the P values are relatively low and the 80% intervals don’t include zero.
Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points
p<-solve(housing_select_cor)
print('The precision matrix is')
## [1] "The precision matrix is"
p
## LotArea YearBuilt OverallCond SalePrice
## LotArea 1.10230050 0.2132917 0.05541643 -0.3980498
## YearBuilt 0.21329169 1.6713025 0.56056051 -0.8865523
## OverallCond 0.05541643 0.5605605 1.19435417 -0.2147493
## SalePrice -0.39804982 -0.8865523 -0.21474934 1.5518791
pcor<-p%*%housing_select_cor
pcor
## LotArea YearBuilt OverallCond SalePrice
## LotArea 1.000000e+00 -2.775558e-17 -6.938894e-18 0.000000e+00
## YearBuilt -8.326673e-17 1.000000e+00 1.110223e-16 -1.110223e-16
## OverallCond -2.081668e-17 0.000000e+00 1.000000e+00 0.000000e+00
## SalePrice 1.110223e-16 0.000000e+00 4.163336e-17 1.000000e+00
corp<-housing_select_cor%*%p
corp
## LotArea YearBuilt OverallCond SalePrice
## LotArea 1.000000e+00 0.000000e+00 6.938894e-18 0.000000e+00
## YearBuilt -2.775558e-17 1.000000e+00 2.775558e-17 -1.110223e-16
## OverallCond -3.469447e-18 9.714451e-17 1.000000e+00 -1.387779e-17
## SalePrice 5.551115e-17 0.000000e+00 5.551115e-17 1.000000e+00
#LU decomposition for correlation matrix
lu_hc<-lu(housing_select_cor)
expand(lu_hc)
## $L
## 4 x 4 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 . . .
## [2,] 0.01422765 1.00000000 . .
## [3,] -0.00563627 -0.37597911 1.00000000 .
## [4,] 0.26384335 0.51924857 0.13838020 1.00000000
##
## $U
## 4 x 4 Matrix of class "dtrMatrix"
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 0.01422765 -0.00563627 0.26384335
## [2,] . 0.99979757 -0.37590300 0.51914346
## [3,] . . 0.85863655 0.11881830
## [4,] . . . 0.64438008
##
## $P
## 4 x 4 sparse Matrix of class "pMatrix"
##
## [1,] | . . .
## [2,] . | . .
## [3,] . . | .
## [4,] . . . |
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/Rdevel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable.
ggplot(housing_select, aes(SalePrice))+
geom_histogram()
Select SalePrice as the right skewed data.
exp_fit<-fitdistr(housing_select$SalePrice, 'exponential')
set.seed(9898)
exp_dist<-rexp(1000, exp_fit$estimate)
hist(exp_dist)
It
looks more like an exponential distribution.
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points
#5th and 95th percentiles using the cumulative distribution function (CDF)
qexp(c(0.05,0.95), rate=exp_fit$estimate)
## [1] 9280.044 541991.465
#95% confidence interval from the empirical data
qnorm(c(0.025, 0.975), mean = mean(housing_select$SalePrice), sd = sd(housing_select$SalePrice))
## [1] 25216.75 336625.64
#Empirical 5th percentile and 95th percentile of the data
quantile(housing_select$SalePrice, c(0.05, 0.95))
## 5% 95%
## 88000 326100
The result from CDF is very different from the actual quantile of the empirical data. The exponential model is incorrect. The normal distribution is closer to the actual at 97.5% but it is much smaller at the other end. The empirical data is not a normal distribution function. Other transformation may require to obtain a better model.
Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 point
I did similar analysis in Data 621 class and borrow the information from my previous work。Several models were tested and only the final one is shown below:
Simplify version
housing_data <- housing
summary(housing_data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
#Correlation Plot
housing_datadd<-housing_data%>%
dplyr::select(where(is.numeric))
corrplot(cor(housing_datadd), method='color', type='lower', sig.level=0.01)
#rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")
hd_lmod6<-housing_data%>%
rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")
lmod6_loga<-lm(log(SalePrice) ~ MSZoning + LotArea + Street + LotConfig +
LandSlope + Neighborhood + Condition1 + Condition2 + BldgType +
OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle +
RoofMatl + Exterior1st + MasVnrType + BsmtQual + BsmtExposure +
BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + FrtflrSF + SecFlrSF +
KitchenAbvGr + KitchenQual + Functional + Fireplaces + GarageCars +
GarageArea + GarageQual + GarageCond + WoodDeckSF + ScreenPorch +
PoolArea + SaleType, data = hd_lmod6)
summary(lmod6_loga)
##
## Call:
## lm(formula = log(SalePrice) ~ MSZoning + LotArea + Street + LotConfig +
## LandSlope + Neighborhood + Condition1 + Condition2 + BldgType +
## OverallQual + OverallCond + YearBuilt + YearRemodAdd + RoofStyle +
## RoofMatl + Exterior1st + MasVnrType + BsmtQual + BsmtExposure +
## BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + FrtflrSF + SecFlrSF +
## KitchenAbvGr + KitchenQual + Functional + Fireplaces + GarageCars +
## GarageArea + GarageQual + GarageCond + WoodDeckSF + ScreenPorch +
## PoolArea + SaleType, data = hd_lmod6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74496 -0.04599 0.00366 0.05176 0.74496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169e+00 7.393e-01 1.581 0.114180
## MSZoningFV 4.974e-01 5.448e-02 9.131 < 2e-16 ***
## MSZoningRH 4.093e-01 5.677e-02 7.210 9.81e-13 ***
## MSZoningRL 4.587e-01 4.698e-02 9.762 < 2e-16 ***
## MSZoningRM 4.002e-01 4.394e-02 9.108 < 2e-16 ***
## LotArea 3.392e-06 4.289e-07 7.908 5.85e-15 ***
## StreetPave 1.540e-01 5.488e-02 2.806 0.005094 **
## LotConfigCulDSac 1.751e-02 1.372e-02 1.276 0.202043
## LotConfigFR2 -3.318e-02 1.781e-02 -1.863 0.062723 .
## LotConfigFR3 -3.645e-02 5.522e-02 -0.660 0.509232
## LotConfigInside -7.968e-03 7.723e-03 -1.032 0.302451
## LandSlopeMod 2.239e-02 1.554e-02 1.441 0.149915
## LandSlopeSev -2.526e-01 4.753e-02 -5.315 1.27e-07 ***
## NeighborhoodBlueste -4.067e-02 8.146e-02 -0.499 0.617725
## NeighborhoodBrDale -4.722e-02 4.678e-02 -1.009 0.313024
## NeighborhoodBrkSide 5.804e-02 4.110e-02 1.412 0.158183
## NeighborhoodClearCr 2.553e-02 3.954e-02 0.646 0.518621
## NeighborhoodCollgCr -6.924e-03 3.043e-02 -0.228 0.820065
## NeighborhoodCrawfor 1.186e-01 3.604e-02 3.292 0.001024 **
## NeighborhoodEdwards -7.442e-02 3.412e-02 -2.181 0.029379 *
## NeighborhoodGilbert -3.470e-03 3.220e-02 -0.108 0.914191
## NeighborhoodIDOTRR 2.643e-02 4.730e-02 0.559 0.576350
## NeighborhoodMeadowV -1.784e-01 4.914e-02 -3.630 0.000295 ***
## NeighborhoodMitchel -3.044e-02 3.479e-02 -0.875 0.381875
## NeighborhoodNAmes -2.945e-02 3.266e-02 -0.902 0.367339
## NeighborhoodNoRidge 4.140e-02 3.442e-02 1.203 0.229287
## NeighborhoodNPkVill 1.108e-02 4.674e-02 0.237 0.812643
## NeighborhoodNridgHt 7.771e-02 3.104e-02 2.503 0.012437 *
## NeighborhoodNWAmes -4.260e-02 3.361e-02 -1.268 0.205178
## NeighborhoodOldTown 9.060e-03 4.180e-02 0.217 0.828432
## NeighborhoodSawyer -1.312e-02 3.454e-02 -0.380 0.704128
## NeighborhoodSawyerW 1.199e-03 3.333e-02 0.036 0.971311
## NeighborhoodSomerst 2.126e-02 3.838e-02 0.554 0.579772
## NeighborhoodStoneBr 1.172e-01 3.471e-02 3.376 0.000760 ***
## NeighborhoodSWISU 3.097e-03 4.152e-02 0.075 0.940541
## NeighborhoodTimber -9.370e-04 3.408e-02 -0.027 0.978069
## NeighborhoodVeenker 4.853e-02 4.484e-02 1.082 0.279328
## Condition1Feedr 6.623e-02 2.340e-02 2.830 0.004730 **
## Condition1Norm 1.015e-01 1.888e-02 5.375 9.17e-08 ***
## Condition1PosA 8.257e-02 4.320e-02 1.912 0.056163 .
## Condition1PosN 1.030e-01 3.219e-02 3.201 0.001407 **
## Condition1RRAe -2.050e-02 3.963e-02 -0.517 0.605005
## Condition1RRAn 8.074e-02 3.021e-02 2.673 0.007628 **
## Condition1RRNe 7.943e-02 7.664e-02 1.036 0.300229
## Condition1RRNn 1.444e-01 5.482e-02 2.633 0.008562 **
## Condition2Feedr -1.012e-01 1.008e-01 -1.004 0.315495
## Condition2Norm -1.107e-02 8.507e-02 -0.130 0.896475
## Condition2PosA 2.698e-01 1.365e-01 1.976 0.048343 *
## Condition2PosN -9.426e-01 1.180e-01 -7.990 3.12e-15 ***
## Condition2RRAe -4.929e-01 1.896e-01 -2.600 0.009433 **
## Condition2RRAn -6.588e-02 1.352e-01 -0.487 0.626123
## Condition2RRNn -1.172e-01 1.148e-01 -1.021 0.307574
## BldgType2fmCon -4.841e-02 2.855e-02 -1.696 0.090206 .
## BldgTypeDuplex -7.556e-02 3.243e-02 -2.330 0.019994 *
## BldgTypeTwnhs -1.013e-01 2.371e-02 -4.270 2.11e-05 ***
## BldgTypeTwnhsE -4.201e-02 1.554e-02 -2.703 0.006960 **
## OverallQual 4.587e-02 4.430e-03 10.354 < 2e-16 ***
## OverallCond 4.048e-02 3.672e-03 11.023 < 2e-16 ***
## YearBuilt 2.398e-03 3.024e-04 7.930 4.96e-15 ***
## YearRemodAdd 7.711e-04 2.477e-04 3.113 0.001898 **
## RoofStyleGable -2.116e-02 7.849e-02 -0.270 0.787562
## RoofStyleGambrel -3.232e-02 8.556e-02 -0.378 0.705740
## RoofStyleHip -1.982e-02 7.875e-02 -0.252 0.801313
## RoofStyleMansard 6.594e-02 9.201e-02 0.717 0.473735
## RoofStyleShed 4.520e-01 1.480e-01 3.054 0.002311 **
## RoofMatlCompShg 2.771e+00 1.315e-01 21.071 < 2e-16 ***
## RoofMatlMembran 3.179e+00 1.961e-01 16.208 < 2e-16 ***
## RoofMatlMetal 3.085e+00 1.927e-01 16.007 < 2e-16 ***
## RoofMatlRoll 2.947e+00 1.712e-01 17.219 < 2e-16 ***
## RoofMatlTar&Grv 2.796e+00 1.533e-01 18.243 < 2e-16 ***
## RoofMatlWdShake 2.686e+00 1.471e-01 18.253 < 2e-16 ***
## RoofMatlWdShngl 2.787e+00 1.370e-01 20.345 < 2e-16 ***
## Exterior1stBrkComm -4.301e-01 1.140e-01 -3.773 0.000169 ***
## Exterior1stBrkFace 1.135e-01 3.403e-02 3.336 0.000875 ***
## Exterior1stCBlock 3.157e-02 1.091e-01 0.289 0.772283
## Exterior1stCemntBd 8.221e-02 3.535e-02 2.326 0.020188 *
## Exterior1stHdBoard 2.375e-02 3.093e-02 0.768 0.442775
## Exterior1stImStucc 7.778e-02 1.083e-01 0.718 0.472748
## Exterior1stMetalSd 6.431e-02 3.052e-02 2.107 0.035326 *
## Exterior1stPlywood 2.042e-02 3.243e-02 0.630 0.529058
## Exterior1stStone 2.418e-02 8.690e-02 0.278 0.780901
## Exterior1stStucco 5.700e-02 3.800e-02 1.500 0.133794
## Exterior1stVinylSd 6.237e-02 3.069e-02 2.032 0.042342 *
## Exterior1stWd Sdng 3.165e-02 3.033e-02 1.043 0.296981
## Exterior1stWdShing 2.560e-02 3.806e-02 0.673 0.501286
## MasVnrTypeBrkFace 6.833e-02 2.920e-02 2.340 0.019429 *
## MasVnrTypeNone 5.453e-02 2.875e-02 1.897 0.058106 .
## MasVnrTypeStone 7.245e-02 3.090e-02 2.345 0.019205 *
## BsmtQualFa -4.815e-02 2.719e-02 -1.771 0.076867 .
## BsmtQualGd -3.690e-02 1.392e-02 -2.652 0.008117 **
## BsmtQualTA -5.091e-02 1.728e-02 -2.945 0.003290 **
## BsmtExposureGd 3.189e-02 1.309e-02 2.436 0.014978 *
## BsmtExposureMn 2.685e-03 1.286e-02 0.209 0.834610
## BsmtExposureNo -1.215e-02 8.889e-03 -1.366 0.172110
## BsmtFinSF1 1.628e-04 2.003e-05 8.125 1.10e-15 ***
## BsmtFinSF2 1.316e-04 2.568e-05 5.124 3.49e-07 ***
## BsmtUnfSF 8.232e-05 1.973e-05 4.173 3.22e-05 ***
## FrtflrSF 2.651e-04 2.098e-05 12.634 < 2e-16 ***
## SecFlrSF 2.619e-04 9.985e-06 26.229 < 2e-16 ***
## KitchenAbvGr -5.089e-02 2.871e-02 -1.772 0.076586 .
## KitchenQualFa -6.974e-02 2.846e-02 -2.450 0.014408 *
## KitchenQualGd -5.269e-02 1.439e-02 -3.661 0.000262 ***
## KitchenQualTA -5.496e-02 1.653e-02 -3.324 0.000914 ***
## FunctionalMaj2 -1.376e-01 6.575e-02 -2.093 0.036539 *
## FunctionalMin1 2.448e-02 4.017e-02 0.609 0.542420
## FunctionalMin2 2.954e-02 3.987e-02 0.741 0.458885
## FunctionalMod -5.039e-02 4.984e-02 -1.011 0.312209
## FunctionalSev -3.792e-01 1.197e-01 -3.168 0.001574 **
## FunctionalTyp 6.907e-02 3.504e-02 1.971 0.048907 *
## Fireplaces 2.566e-02 5.810e-03 4.416 1.09e-05 ***
## GarageCars 2.516e-02 9.846e-03 2.556 0.010721 *
## GarageArea 1.046e-04 3.239e-05 3.229 0.001276 **
## GarageQualFa -4.389e-01 1.199e-01 -3.662 0.000261 ***
## GarageQualGd -3.780e-01 1.230e-01 -3.074 0.002159 **
## GarageQualPo -5.213e-01 1.474e-01 -3.536 0.000422 ***
## GarageQualTA -4.065e-01 1.187e-01 -3.425 0.000635 ***
## GarageCondFa 3.108e-01 1.415e-01 2.196 0.028255 *
## GarageCondGd 3.734e-01 1.447e-01 2.581 0.009982 **
## GarageCondPo 4.240e-01 1.520e-01 2.790 0.005361 **
## GarageCondTA 3.711e-01 1.398e-01 2.654 0.008066 **
## WoodDeckSF 7.449e-05 2.549e-05 2.922 0.003541 **
## ScreenPorch 1.900e-04 5.262e-05 3.610 0.000319 ***
## PoolArea 1.330e-04 7.536e-05 1.765 0.077802 .
## SaleTypeCon 1.074e-01 7.710e-02 1.393 0.163740
## SaleTypeConLD 1.444e-01 4.858e-02 2.973 0.003011 **
## SaleTypeConLI -5.701e-02 5.564e-02 -1.025 0.305766
## SaleTypeConLw 2.232e-02 5.537e-02 0.403 0.686959
## SaleTypeCWD 4.836e-02 5.529e-02 0.875 0.381927
## SaleTypeNew 4.994e-02 2.121e-02 2.355 0.018692 *
## SaleTypeOth 7.890e-02 1.036e-01 0.761 0.446644
## SaleTypeWD 6.764e-03 1.734e-02 0.390 0.696500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1014 on 1209 degrees of freedom
## (因为不存在,120个观察量被删除了)
## Multiple R-squared: 0.9346, Adjusted R-squared: 0.9276
## F-statistic: 132.9 on 130 and 1209 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lmod6_loga)
sqrt(sum(lmod6_loga$residuals^2) / lmod6_loga$df)
## [1] 0.101422
rmse(hd_lmod6$SalePrice, exp(lmod6_loga$fitted.values))
## [1] 111218.1
#r Predict
test_housing<-read_csv("https://raw.githubusercontent.com/ferrysany/CUNY-Data621/main/test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(test_housing)
## [1] 1459 80
test_housing_data <- test_housing %>%
rename(FrtflrSF="1stFlrSF", SecFlrSF="2ndFlrSF", ThdSsnPorch="3SsnPorch")
# test_housing_data1<-test_housing_data1%>%
# dplyr::select(MSZoning, LotArea, Street, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, MasVnrType, BsmtQual, BsmtExposure, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, FrtflrSF, SecFlrSF, KitchenAbvGr, KitchenQual,Functional,Fireplaces, GarageCars, GarageArea, GarageQual, GarageCond, WoodDeckSF, ScreenPorch, PoolArea, SaleType)
test_housing_data1<-test_housing_data
test_housing_data1$Exterior1st[which(!(test_housing_data1$Exterior1st %in% unique(hd_lmod6$Exterior1st)))]<-NA
#There is an error when I run the code below, I can't figure out how to remove the extra level in one of the variables.
#response_predict <- predict(lmod6_log, test_housing_data1, type='response')
#response_predict <- predict(lmod6_loga, test_housing_data1)
#res <- data.frame(Id = test_housing$Id, SalePrice = exp(response_predict))
#write.csv(res, file = "price_step.csv", row.names = FALSE)