# Note: I load MASS before the tidyverse so that dplyr::select() masks
# MASS::select(), rather than the other way around
library(MASS)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(cowplot)
The data come from the Kaggle House Prices competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques.
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right!
Pick the dependent variable and define it as Y.
train <- read_csv("/Users/anjalibm.com/Documents/DataScience/DATA_605/Final/data/train.csv", na = c("", "NA"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## Id = col_double(),
## MSSubClass = col_double(),
## LotFrontage = col_double(),
## LotArea = col_double(),
## OverallQual = col_double(),
## OverallCond = col_double(),
## YearBuilt = col_double(),
## YearRemodAdd = col_double(),
## MasVnrArea = col_double(),
## BsmtFinSF1 = col_double(),
## BsmtFinSF2 = col_double(),
## BsmtUnfSF = col_double(),
## TotalBsmtSF = col_double(),
## `1stFlrSF` = col_double(),
## `2ndFlrSF` = col_double(),
## LowQualFinSF = col_double(),
## GrLivArea = col_double(),
## BsmtFullBath = col_double(),
## BsmtHalfBath = col_double(),
## FullBath = col_double()
## # ... with 18 more columns
## )
## ℹ Use `spec()` for the full column specifications.
test <- read_csv("/Users/anjalibm.com/Documents/DataScience/DATA_605/Final/data/test.csv", na = c("", "NA"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## Id = col_double(),
## MSSubClass = col_double(),
## LotFrontage = col_double(),
## LotArea = col_double(),
## OverallQual = col_double(),
## OverallCond = col_double(),
## YearBuilt = col_double(),
## YearRemodAdd = col_double(),
## MasVnrArea = col_double(),
## BsmtFinSF1 = col_double(),
## BsmtFinSF2 = col_double(),
## BsmtUnfSF = col_double(),
## TotalBsmtSF = col_double(),
## `1stFlrSF` = col_double(),
## `2ndFlrSF` = col_double(),
## LowQualFinSF = col_double(),
## GrLivArea = col_double(),
## BsmtFullBath = col_double(),
## BsmtHalfBath = col_double(),
## FullBath = col_double()
## # ... with 17 more columns
## )
## ℹ Use `spec()` for the full column specifications.
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
\[ P(X > x | Y > y) = \frac{P(X > x \cap Y > y)}{P(Y > y)} \]
In words, we’re looking for the conditional probability that X (OpenPorchSF) is greater than its first quartile, given that Y (SalePrice) is greater than its first quartile.
# Get first quantile for each variable
x_fq <- quantile(train$OpenPorchSF, 0.25)
y_fq <- quantile(train$SalePrice, 0.25)
denominator <- train[train$SalePrice > y_fq, ]
numerator <- denominator[denominator$OpenPorchSF > x_fq, ]
paste0("Probability = ", nrow(numerator) / nrow(denominator))
## [1] "Probability = 0.647488584474886"
rm(denominator, numerator)
\[ P(X > x, Y > y) = P(X > x \cap Y > y) \]
jointp <- train[train$OpenPorchSF > x_fq & train$SalePrice > y_fq, ]
paste0("Probability = ", nrow(jointp) / nrow(train))
## [1] "Probability = 0.485616438356164"
rm(jointp)
\[ P(X < x | Y > y) = \frac{P(X < x \cap Y > y)}{P(Y > y)} \]
denominator <- train[train$SalePrice > y_fq, ]
numerator <- denominator[denominator$OpenPorchSF < x_fq, ]
paste0("Probability = ", nrow(numerator) / nrow(denominator))
## [1] "Probability = 0"
oo <- nrow(train[train$OpenPorchSF <= x_fq & train$SalePrice <= y_fq, ])
ot <- nrow(train[train$OpenPorchSF <= x_fq & train$SalePrice > y_fq, ])
or <- oo + ot
to <- nrow(train[train$OpenPorchSF > x_fq & train$SalePrice <= y_fq, ])
tt <- nrow(train[train$OpenPorchSF > x_fq & train$SalePrice > y_fq, ])
tr <- to + tt
ro <- oo + to
rt <- ot + tt
rr <- ro + rt
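These counts can be assembled into the table the prompt asks for; a minimal sketch (the row and column labels are my own):
# Assemble the counts into a 3 x 3 table; the last row and column are margins
counts <- matrix(c(oo, ot, or, to, tt, tr, ro, rt, rr), nrow = 3, byrow = TRUE)
dimnames(counts) <- list(c("X <= x", "X > x", "Total"), c("Y <= y", "Y > y", "Total"))
counts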
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
For brevity’s sake, I’ll only summarize the last 10 (numeric) columns.
train %>%
select_if(is.numeric) %>%
psych::describe() %>%
tail(10)
## vars n mean sd median trimmed mad min
## WoodDeckSF 29 1460 94.24 125.34 0 71.76 0.00 0
## OpenPorchSF 30 1460 46.66 66.26 25 33.23 37.06 0
## EnclosedPorch 31 1460 21.95 61.12 0 3.87 0.00 0
## 3SsnPorch 32 1460 3.41 29.32 0 0.00 0.00 0
## ScreenPorch 33 1460 15.06 55.76 0 0.00 0.00 0
## PoolArea 34 1460 2.76 40.18 0 0.00 0.00 0
## MiscVal 35 1460 43.49 496.12 0 0.00 0.00 0
## MoSold 36 1460 6.32 2.70 6 6.25 2.97 1
## YrSold 37 1460 2007.82 1.33 2008 2007.77 1.48 2006
## SalePrice 38 1460 180921.20 79442.50 163000 170783.29 56338.80 34900
## max range skew kurtosis se
## WoodDeckSF 857 857 1.54 2.97 3.28
## OpenPorchSF 547 547 2.36 8.44 1.73
## EnclosedPorch 552 552 3.08 10.37 1.60
## 3SsnPorch 508 508 10.28 123.06 0.77
## ScreenPorch 480 480 4.11 18.34 1.46
## PoolArea 738 738 14.80 222.19 1.05
## MiscVal 15500 15500 24.43 697.64 12.98
## MoSold 12 11 0.21 -0.41 0.07
## YrSold 2010 4 0.10 -1.19 0.03
## SalePrice 755000 720100 1.88 6.50 2079.11
I will do boxplots for numeric variables, and bar charts for categorical data.
train.bp <- train %>%
select_if(is.numeric) %>%
select(-matches("Year|Yr"), -c(Id, SalePrice, LotArea)) %>%
gather()
ggplot(train.bp, aes(x = key, y = value)) +
labs(x = "variable", title = "Training Data Boxplot") +
geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 267 rows containing non-finite values (stat_boxplot).
rm(oo, or, ot, ro, rr, rt, to, tr, tt, x_fq, y_fq, train.bp, denominator, numerator)
train.bc <- train %>%
select_if(is.character) %>%
gather()
ggplot(data = train.bc, mapping = aes(x = value)) +
geom_bar() +
facet_wrap(~key, scales = 'free', ncol = 4) +
theme(axis.text.x = element_text(angle = 90))
rm(train.bc)
ggplot(data = train, aes(x = OpenPorchSF, y = SalePrice)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Rename columns that begin with a digit so they can be used directly in model formulas
train %<>% rename(firstFlrSF = `1stFlrSF`, secFlrSF = `2ndFlrSF`, threeSsnPorch = `3SsnPorch`)
test %<>% rename(firstFlrSF = `1stFlrSF`, secFlrSF = `2ndFlrSF`, threeSsnPorch = `3SsnPorch`)
cors <- cor(train[, c("MSSubClass", "firstFlrSF", "GrLivArea")])
\(H_0:\) The correlation coefficient is 0, i.e. the two variables are not linearly related. \(H_1:\) The correlation coefficient is not 0.
cor.test(train$MSSubClass, train$firstFlrSF, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$MSSubClass and train$firstFlrSF
## t = -9.933, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## -0.2941963 -0.2083297
## sample estimates:
## cor
## -0.2517584
cor.test(train$MSSubClass, train$GrLivArea, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$MSSubClass and train$GrLivArea
## t = 2.8662, df = 1458, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.02912052 0.12027312
## sample estimates:
## cor
## 0.07485318
cor.test(train$firstFlrSF, train$GrLivArea, conf.level = 0.92)
##
## Pearson's product-moment correlation
##
## data: train$firstFlrSF and train$GrLivArea
## t = 26.217, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
## 0.5340458 0.5963849
## sample estimates:
## cor
## 0.566024
Since the p-value for all three tests is much smaller than \(\alpha = 0.08\), we can reject the null hypothesis, and conclude that there is some correlation between the variables.
According to Wikipedia, the familywise error rate (FWER) is the probability of making at least one Type I error, i.e. rejecting at least one true null hypothesis (a false positive), across a family of tests. It can be controlled by ensuring \[ FWER = P(V \geq 1) = 1 - P(V = 0) \leq \alpha \]
Assuming the three tests are independent, \(FWER = 1-(1-0.08)^3 =\) 0.221312.
So yes, we should be somewhat worried: there is a ~22% chance of making at least one Type I error across the three tests.
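A quick sketch confirming that arithmetic, plus one common remedy, a Bonferroni adjustment of the three p-values via stats::p.adjust (this block is my own addition; the two p-values reported as < 2.2e-16 are entered at that bound):
# FWER for three independent tests at alpha = 0.08
1 - (1 - 0.08)^3
# Bonferroni-adjusted p-values (taken from the cor.test output above)
p.adjust(c(2.2e-16, 0.004214, 2.2e-16), method = "bonferroni")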
Invert your \(3 \times 3\) correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
Invert our correlation matrix
precm <- solve(cors)
Multiplying the correlation matrix by the precision (inverted) matrix
round(cors %*% precm)
## MSSubClass firstFlrSF GrLivArea
## MSSubClass 1 0 0
## firstFlrSF 0 1 0
## GrLivArea 0 0 1
round(precm %*% cors)
## MSSubClass firstFlrSF GrLivArea
## MSSubClass 1 0 0
## firstFlrSF 0 1 0
## GrLivArea 0 0 1
The result of the matrix multiplication is the identity matrix, as expected.
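As an added check of the variance-inflation-factor interpretation (not part of the original analysis), the diagonal entry for GrLivArea should equal \(1/(1-R^2)\) from regressing GrLivArea on the other two variables:
# VIF check: regress GrLivArea on the other two variables and compare
# 1 / (1 - R^2) with the corresponding diagonal entry of the precision matrix
r2 <- summary(lm(GrLivArea ~ MSSubClass + firstFlrSF, data = train))$r.squared
c(vif_from_regression = 1 / (1 - r2), precision_diagonal = precm["GrLivArea", "GrLivArea"])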
To perform the LU decomposition, I’ll recycle the function I created for assignment 2.
factorize <- function(mat){
  L <- diag(nrow(mat)) # Lower-triangular matrix of the same size, starting as the identity
  U <- mat
  # If the leading entry U[1,1] is 0, swap row 1 with the first row that has a nonzero pivot
  if (U[1,1] == 0){
    for (i in 2:nrow(U)){
      if (U[i,1] != 0){
        tmp <- U[1,]
        U[1,] <- U[i,]
        U[i,] <- tmp
        break
      }
    }
  }
  # Loop over the first n-1 columns and the rows below each pivot: compute the
  # multiplier, eliminate to build the upper-triangular matrix U, and store
  # the multiplier in the lower-triangular matrix L
  for (k in 1:(nrow(U)-1)){
    for (j in (k+1):(nrow(U))){
      if (U[j,k] != 0){
        mult <- U[j,k] / U[k,k]
        U[j,] <- U[j,] - (mult * U[k,])
        L[j,k] <- mult
      }
    }
  }
  return(list(L, U))
}
factorize(cors)
## [[1]]
## [,1] [,2] [,3]
## [1,] 1.00000000 0.0000000 0
## [2,] -0.25175835 1.0000000 0
## [3,] 0.07485318 0.6244478 1
##
## [[2]]
## MSSubClass firstFlrSF GrLivArea
## MSSubClass 1 -0.2517584 0.07485318
## firstFlrSF 0 0.9366177 0.58486888
## GrLivArea 0 0.0000000 0.62917691
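As a quick verification (my addition), multiplying the two factors back together should reproduce the original correlation matrix:
lu <- factorize(cors)
# L %*% U should recover cors up to floating-point error (dimnames ignored)
all.equal(lu[[1]] %*% lu[[2]], cors, check.attributes = FALSE)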
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Since the minimum OpenPorchSF value is 0, I’ll shift the variable up by 0.0001 so that every value is strictly positive.
shifted <- train$OpenPorchSF- min(train$OpenPorchSF) + 0.0001
expdf <- fitdistr(shifted, densfun = "exponential")
lambda <- expdf$estimate
lambdas <- rexp(1000, lambda)
The optimal lambda value is \(\lambda =\) 0.0214315.
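As a sanity check (my addition), the exponential MLE has the closed form \(\hat\lambda = 1/\bar{x}\), which should agree with the fitdistr estimate:
# The exponential MLE is the reciprocal of the sample mean
c(closed_form = 1 / mean(shifted), fitdistr = unname(lambda))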
plot_grid(ggplot() +
geom_histogram(aes(lambdas), fill=("blue"), col=("red"), alpha=(.2)),
ggplot(data = train, aes(x = OpenPorchSF)) +
geom_histogram(fill = "blue", colour = "red", alpha = 0.2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From Wikipedia, the Cumulative Distribution Function for the Exponential Distribution is:
\[ F(x;\lambda) = 1 - e^{-\lambda x} \]
The quantile function is thus
\[ F^{-1}(p;\lambda) = \frac{-\ln(1 - p)}{\lambda}, \ \ \ \ 0 \leq p < 1 \]
paste0("5th Percentile of simulated data = ", log(1 - 0.05)/-lambda)
## [1] "5th Percentile of simulated data = 2.39336429840992"
paste0("95th Percentile of simulated data = ", log(1 - 0.95)/-lambda)
## [1] "95th Percentile of simulated data = 139.781988205825"
paste0("5th Percentile of empirical data = ", quantile(shifted, 0.05))
## [1] "5th Percentile of empirical data = 1e-04"
paste0("95th Percentile of empirical data = ", quantile(shifted, 0.95))
## [1] "95th Percentile of empirical data = 175.0501"
The formula for a confidence interval for the mean of normally distributed data is \[ \bar{x} \pm t_{\alpha/2,\, n-1}\cdot \frac{s}{\sqrt{n}} \]
With n = 1460, the t critical value for a 95% confidence interval is essentially the z value, 1.96, so I’ll use that.
paste0("95% Confidence interval for empirical data = (", mean(shifted) - 1.96 * sd(shifted) / sqrt(length(shifted)), ", ", mean(shifted) + 1.96 * sd(shifted) / sqrt(length(shifted)), ")")
## [1] "95% Confidence interval for empirical data = (43.2617349622305, 50.059012982975)"
5% of houses in the sample have a (shifted) OpenPorchSF value below 0.0001, i.e. essentially zero, while 95% are below 175.0501. The 95% confidence interval tells us that if we repeatedly sampled houses this way, about 95% of such intervals would contain the true mean OpenPorchSF, here estimated to lie between ~43.26 and ~50.06. Given the large gap between the percentiles of the fitted exponential distribution and the empirical percentiles, the exponential distribution may not be a good fit for this variable.
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
unloadNamespace("MASS")
colSums(is.na(train))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical firstFlrSF secFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch threeSsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
# Convert training dataset
trf <- train %>%
select(-c(Id, Alley, BldgType, LotFrontage, LowQualFinSF, PoolArea, PoolQC, Utilities, MiscFeature, MiscVal, Street))
trf <- trf[!(trf$YearBuilt %in% c(1893)), ]
trf %<>%
mutate_if(is.character, as_factor)
trf[, c("GarageYrBlt", "MoSold", "YrSold", "YearBuilt", "YearRemodAdd")] %<>%
mutate_all(as.factor)
trf$MSSubClass <- factor(trf$MSSubClass, levels = c(20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180))
trf[, c("OverallQual", "OverallCond", "ExterQual", "ExterCond")] %<>%
mutate_all(as.ordered)
trf <- mutate_at(trf, vars(matches("ExterQ.*")), fct_relevel, ... = "Fa", "TA", "Gd", "Ex")
trf <- mutate_at(trf, vars(matches("ExterC.*")), fct_relevel, ... = "Po", "Fa", "TA", "Gd", "Ex")
# trf$LotFrontage[is.na(trf$LotFrontage)] <- 0
trf <- mutate_at(trf, colnames(trf)[colSums(is.na(trf)) > 0], addNA)
trf$MasVnrArea %<>% as.numeric()
# Convert testing dataset
tf <- test %>%
select(-c(Id, Alley, BldgType, LotFrontage, LowQualFinSF, PoolArea, PoolQC, Utilities, MiscFeature, MiscVal, Street))
tf <- tf[!(tf$MSSubClass %in% c(150)), ]
tf <- tf[!(tf$MSZoning %in% c(NA)), ]
tf <- tf[!(tf$YearBuilt %in% c(1879, 1895, 1896, 1901, 1902, 1907)), ]
tf <- tf[!(tf$Exterior1st %in% c(NA)), ]
tf <- tf[!(tf$KitchenQual %in% c(NA)), ]
tf <- tf[!(tf$Functional %in% c(NA)), ]
tf <- tf[!(tf$GarageYrBlt %in% c(1917, 1919, 1943, 2207)), ]
tf <- tf[!(tf$SaleType %in% c(NA)), ]
tf <- tf[!(tf$BsmtFinSF1 %in% c(NA)), ]
tf <- tf[!(tf$BsmtFullBath %in% c(NA)), ]
tf <- tf[!(tf$GarageCars %in% c(NA)), ]
tf <- tf[!(tf$Condition2 %in% c("PosA")), ]
tf %<>%
mutate_if(is.character, as_factor)
tf[, c("GarageYrBlt", "MoSold", "YrSold", "YearBuilt", "YearRemodAdd")] %<>%
mutate_all(as.factor)
tf$MSSubClass <- factor(tf$MSSubClass, levels = c(20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180))
tf[, c("OverallQual", "OverallCond", "ExterQual", "ExterCond")] %<>%
mutate_all(as.ordered)
tf <- mutate_at(tf, vars(matches("ExterQ.*")), fct_relevel, ... = "Fa", "TA", "Gd", "Ex")
tf <- mutate_at(tf, vars(matches("ExterC.*")), fct_relevel, ... = "Po", "Fa", "TA", "Gd", "Ex")
# tf$LotFrontage[is.na(tf$LotFrontage)] <- 0
tf <- mutate_at(tf, colnames(tf)[colSums(is.na(tf)) > 0], addNA)
tf$MasVnrArea %<>% as.numeric()
# m1 <- lm(SalePrice ~ ., data = trf)
#
# summary(m1)
#
# predict(m1, newdata = tf, type = "response")
There are many variables that can be recoded. Many columns can be converted from numeric to categorical (factor). Some columns, especially those with a high number of missing values, can be converted to two-level factors: houses with the property and houses without it (1 and 0). This lets us keep the variable rather than discard it; for example, an NA in the PoolQC column probably means the house doesn’t have a pool. Lastly, we can convert four columns containing quality information to ordered factors.
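A minimal sketch of that two-level recoding, using PoolQC as an example (the has_pool name is my own, and this recoding is not applied to the model below):
# Treat NA in PoolQC as "no pool" (0) and any recorded quality as "has a pool" (1)
has_pool <- factor(if_else(is.na(train$PoolQC), 0L, 1L))
table(has_pool)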
m1 <- lm(SalePrice ~ ., data = trf)
# summary(m1)
# length(m1$coefficients) > m1$rank
rp1 <- ggplot(m1, aes(.fitted, .resid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE) +
labs(title = "Residuals vs Fitted")
rp2 <- ggplot(m1, aes(.fitted, .stdresid)) +
geom_point() +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE)
rp3 <- ggplot(m1) +
stat_qq(aes(sample = .stdresid)) +
geom_abline()
rp4 <- ggplot(m1, aes(.fitted, sqrt(abs(.stdresid)))) +
geom_point() +
geom_smooth(se = FALSE) +
labs(title = "Scale-Location")
rp5 <- ggplot(m1, aes(seq_along(.cooksd), .cooksd)) +
geom_col() +
ylim(0, 0.0075) +
labs(title = "Cook's Distance")
rp6 <- ggplot(m1, aes(.hat, .stdresid)) +
geom_point(aes(size = .cooksd)) +
geom_smooth(se = FALSE, size = 0.5) +
labs(title = "Residuals vs Leverage")
rp7 <- ggplot(m1, aes(.hat, .cooksd)) +
geom_vline(xintercept = 0, colour = NA) +
geom_abline(slope = seq(0, 3, by = 0.5), colour = "white") +
geom_smooth(se = FALSE) +
geom_point() +
labs(title = "Cook's distance vs Leverage")
rp8 <- ggplot(m1, aes(.resid)) +
geom_histogram(bins = 7, color="darkblue", fill="steelblue")
plot_grid(rp1, rp2, rp3, rp4, rp5, rp6, rp7, rp8, ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing non-finite values (stat_qq).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 94 rows containing missing values (position_stack).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
# levels(tf) <- union(levels(tf), levels(trf))
preds <- predict(m1, newdata = tf, type = "response")
## Warning in predict.lm(m1, newdata = tf, type = "response"): prediction from a
## rank-deficient fit may be misleading
rdf <- data.frame(ID = 1:1431, Predicted = preds)
head(rdf, 10)
## ID Predicted
## 1 1 112738.1
## 2 2 159855.4
## 3 3 199412.6
## 4 4 200603.6
## 5 5 194311.2
## 6 6 162449.4
## 7 7 173211.2
## 8 8 155506.3
## 9 9 185514.1
## 10 10 119749.0
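For completeness, a hedged sketch of the file format Kaggle expects for this competition (a CSV with Id and SalePrice columns). Because rows with unseen factor levels were dropped from tf, the Ids here are placeholders and the missing rows would need to be imputed before a real submission:
# Placeholder submission file in Kaggle's expected format (Id, SalePrice)
submission <- data.frame(Id = rdf$ID, SalePrice = rdf$Predicted)
write_csv(submission, "submission.csv")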