library(dplyr)
library(tidyr)
library(summarytools)
library(DT)
library(ggplot2)
library(printr)
library(xgboost)
library(caret)
setwd("~/Documents/Data Task 6 & 7")
trainori <- read.csv("train.csv", header = TRUE)
testori <- read.csv("test.csv", header = TRUE)
submission <- read.csv("sample_submission.csv",header = TRUE)
sale_price <- trainori$SalePrice # keep the target aside before combining
train <- trainori
train <- train[,!colnames(train) %in% c("SalePrice")] # drop the target so train and test share the same columns
housejoin <- rbind(train,testori) # combine train and test for joint preprocessing
print(housejoin)
Before diving into modeling, it’s essential to have a solid grasp of the data. In this section, we’ll perform exploratory data analysis (EDA) to gain insights into our dataset. We’ll visualize key variables, identify outliers, and analyze correlations among features.
# Generate descriptive statistics using summarytools
desc_stats <- descr(housejoin)
desc_stats
Non-numerical variable(s) ignored: MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, KitchenQual, Functional, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MiscFeature, SaleType, SaleCondition
Descriptive Statistics
housejoin
N: 2919
BedroomAbvGr BsmtFinSF1 BsmtFinSF2 BsmtFullBath BsmtHalfBath BsmtUnfSF
----------------- -------------- ------------ ------------ -------------- -------------- -----------
Mean 2.86 441.42 49.58 0.43 0.06 560.77
Std.Dev 0.82 455.61 169.21 0.52 0.25 439.54
Min 0.00 0.00 0.00 0.00 0.00 0.00
Q1 2.00 0.00 0.00 0.00 0.00 220.00
Median 3.00 368.50 0.00 0.00 0.00 467.00
Q3 3.00 733.00 0.00 1.00 0.00 806.00
Max 8.00 5644.00 1526.00 3.00 2.00 2336.00
MAD 0.00 546.34 0.00 0.00 0.00 415.13
IQR 1.00 733.00 0.00 1.00 0.00 585.50
CV 0.29 1.03 3.41 1.22 4.00 0.78
Skewness 0.33 1.42 4.14 0.62 3.93 0.92
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 1.93 6.88 18.79 -0.74 14.81 0.40
N.Valid 2919.00 2918.00 2918.00 2917.00 2917.00 2918.00
Pct.Valid 100.00 99.97 99.97 99.93 99.93 99.97
Table: Table continues below
EnclosedPorch Fireplaces FullBath GarageArea GarageCars GarageYrBlt
----------------- --------------- ------------ ---------- ------------ ------------ -------------
Mean 23.10 0.60 1.57 472.87 1.77 1978.11
Std.Dev 64.24 0.65 0.55 215.39 0.76 25.57
Min 0.00 0.00 0.00 0.00 0.00 1895.00
Q1 0.00 0.00 1.00 320.00 1.00 1960.00
Median 0.00 1.00 2.00 480.00 2.00 1979.00
Q3 0.00 1.00 2.00 576.00 2.00 2002.00
Max 1012.00 4.00 4.00 1488.00 5.00 2207.00
MAD 0.00 1.48 0.00 183.84 0.00 31.13
IQR 0.00 1.00 1.00 256.00 1.00 42.00
CV 2.78 1.08 0.35 0.46 0.43 0.01
Skewness 4.00 0.73 0.17 0.24 -0.22 -0.38
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 28.31 0.07 -0.54 0.93 0.23 1.80
N.Valid 2919.00 2919.00 2919.00 2918.00 2918.00 2760.00
Pct.Valid 100.00 100.00 100.00 99.97 99.97 94.55
Table: Table continues below
GrLivArea HalfBath Id KitchenAbvGr LotArea LotFrontage
----------------- ----------- ---------- --------- -------------- ----------- -------------
Mean 1500.76 0.38 1460.00 1.04 10168.11 69.31
Std.Dev 506.05 0.50 842.79 0.21 7887.00 23.34
Min 334.00 0.00 1.00 0.00 1300.00 21.00
Q1 1126.00 0.00 730.00 1.00 7476.00 59.00
Median 1444.00 0.00 1460.00 1.00 9453.00 68.00
Q3 1744.00 1.00 2190.00 1.00 11577.00 80.00
Max 5642.00 2.00 2919.00 3.00 215245.00 313.00
MAD 464.05 0.00 1082.30 0.00 3023.02 17.79
IQR 617.50 1.00 1459.00 0.00 4092.00 21.00
CV 0.34 1.32 0.58 0.21 0.78 0.34
Skewness 1.27 0.69 0.00 4.30 12.82 1.50
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 4.11 -1.04 -1.20 19.73 264.31 11.26
N.Valid 2919.00 2919.00 2919.00 2919.00 2919.00 2433.00
Pct.Valid 100.00 100.00 100.00 100.00 100.00 83.35
Table: Table continues below
LowQualFinSF MasVnrArea MiscVal MoSold MSSubClass OpenPorchSF
----------------- -------------- ------------ ---------- --------- ------------ -------------
Mean 4.69 102.20 50.83 6.21 57.14 47.49
Std.Dev 46.40 179.33 567.40 2.71 42.52 67.58
Min 0.00 0.00 0.00 1.00 20.00 0.00
Q1 0.00 0.00 0.00 4.00 20.00 0.00
Median 0.00 0.00 0.00 6.00 50.00 26.00
Q3 0.00 164.00 0.00 8.00 70.00 70.00
Max 1064.00 1600.00 17000.00 12.00 190.00 742.00
MAD 0.00 0.00 0.00 2.97 44.48 38.55
IQR 0.00 164.00 0.00 4.00 50.00 70.00
CV 9.88 1.75 11.16 0.44 0.74 1.42
Skewness 12.08 2.60 21.94 0.20 1.37 2.53
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 174.51 9.23 562.72 -0.46 1.45 10.91
N.Valid 2919.00 2896.00 2919.00 2919.00 2919.00 2919.00
Pct.Valid 100.00 99.21 100.00 100.00 100.00 100.00
Table: Table continues below
OverallCond OverallQual PoolArea ScreenPorch TotalBsmtSF TotRmsAbvGrd
----------------- ------------- ------------- ---------- ------------- ------------- --------------
Mean 5.56 6.09 2.25 16.06 1051.78 6.45
Std.Dev 1.11 1.41 35.66 56.18 440.77 1.57
Min 1.00 1.00 0.00 0.00 0.00 2.00
Q1 5.00 5.00 0.00 0.00 793.00 5.00
Median 5.00 6.00 0.00 0.00 989.50 6.00
Q3 6.00 7.00 0.00 0.00 1302.00 7.00
Max 9.00 10.00 800.00 576.00 6110.00 15.00
MAD 0.00 1.48 0.00 0.00 350.63 1.48
IQR 1.00 2.00 0.00 0.00 509.00 2.00
CV 0.20 0.23 15.84 3.50 0.42 0.24
Skewness 0.57 0.20 16.89 3.94 1.16 0.76
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 1.47 0.06 297.91 17.73 9.13 1.16
N.Valid 2919.00 2919.00 2919.00 2919.00 2918.00 2919.00
Pct.Valid 100.00 100.00 100.00 100.00 99.97 100.00
Table: Table continues below
WoodDeckSF X1stFlrSF X2ndFlrSF X3SsnPorch YearBuilt YearRemodAdd YrSold
----------------- ------------ ----------- ----------- ------------ ----------- -------------- ---------
Mean 93.71 1159.58 336.48 2.60 1971.31 1984.26 2007.79
Std.Dev 126.53 392.36 428.70 25.19 30.29 20.89 1.31
Min 0.00 334.00 0.00 0.00 1872.00 1950.00 2006.00
Q1 0.00 876.00 0.00 0.00 1953.00 1965.00 2007.00
Median 0.00 1082.00 0.00 0.00 1973.00 1993.00 2008.00
Q3 168.00 1388.00 704.00 0.00 2001.00 2004.00 2009.00
Max 1424.00 5095.00 2065.00 508.00 2010.00 2010.00 2010.00
MAD 0.00 348.41 0.00 0.00 37.06 20.76 1.48
IQR 168.00 511.50 704.00 0.00 47.50 39.00 2.00
CV 1.35 0.34 1.27 9.68 0.02 0.01 0.00
Skewness 1.84 1.47 0.86 11.37 -0.60 -0.45 0.13
SE.Skewness 0.05 0.05 0.05 0.05 0.05 0.05 0.05
Kurtosis 6.72 6.94 -0.43 149.05 -0.51 -1.35 -1.16
N.Valid 2919.00 2919.00 2919.00 2919.00 2919.00 2919.00 2919.00
Pct.Valid 100.00 100.00 100.00 100.00 100.00 100.00 100.00
# Define the R function to get missing value counts
get_missing_value_counts <- function(data_frame) {
missing_counts <- colSums(is.na(data_frame))
missing_counts <- missing_counts[missing_counts > 0]
missing_counts <- sort(missing_counts, decreasing = TRUE)
percent <- colSums(is.na(data_frame)) / nrow(data_frame)
percent <- percent[percent > 0]
percent <- sort(percent, decreasing = TRUE)
missing_data <- data.frame(Missing_counts = missing_counts, Percent = percent)
return(missing_data)
}
# Call the R function and print the missing value counts
train_missing_values <- get_missing_value_counts(housejoin)
print(train_missing_values)
# Set the threshold for missing values (70 rows, roughly 2.4% of the 2,919 observations)
house_prices <- housejoin
threshold <- 70
# Calculate the number of missing values in each column
missing_values <- colSums(is.na(house_prices))
# Get the names of columns whose missing-value count exceeds the threshold
columns_to_remove <- names(house_prices)[missing_values > threshold]
# Remove the selected columns from the dataset
house_prices <- house_prices[, !names(house_prices) %in% columns_to_remove]
train_missing_values <- get_missing_value_counts(house_prices)
print(train_missing_values)
# Identify character columns
char_col <- sapply(house_prices, is.character)
# Convert character columns to factors
house_prices[char_col] <- lapply(house_prices[char_col], as.factor)
# Identify factor (categorical) columns
factor_cols <- sapply(house_prices, is.factor)
# Loop through each factor column and replace missing values with the mode
for (col in names(house_prices)[factor_cols]) {
mode_val <- names(sort(table(house_prices[[col]]), decreasing = TRUE))[1] # Find the mode (most frequent level)
house_prices[is.na(house_prices[[col]]), col] <- mode_val
}
# Identify integer columns
integer_cols <- sapply(house_prices, is.integer)
# Loop through each integer column and replace missing values with the median
for (col in names(house_prices)[integer_cols]) {
median_val <- median(house_prices[[col]], na.rm = TRUE) # Calculate the median
house_prices[[col]][is.na(house_prices[[col]])] <- median_val # Replace missing values with the median
}
print(house_prices)
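As a quick check (a sketch, not part of the original chunk), the imputation above should leave no missing values in the columns that were kept:
# Sketch: verify that no NAs remain after the mode/median imputation
sum(is.na(house_prices)) # expected to be 0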
ggplot(trainori, aes(x = SalePrice)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 15000, colour = "black", fill = "white") +
geom_density(alpha = .2, fill="#FF6666") +
ggtitle("Histogram and Density Plot of Sale Price") +
xlab("Sale Price") + ylab("Frequency")
descr(trainori$SalePrice)
Descriptive Statistics
trainori$SalePrice
N: 1460
SalePrice
----------------- -----------
Mean 180921.20
Std.Dev 79442.50
Min 34900.00
Q1 129950.00
Median 163000.00
Q3 214000.00
Max 755000.00
MAD 56338.80
IQR 84025.00
CV 0.44
Skewness 1.88
SE.Skewness 0.06
Kurtosis 6.50
N.Valid 1460.00
Pct.Valid 100.00
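SalePrice is strongly right-skewed (skewness 1.88, kurtosis 6.50). A log transform is a common way to tame this skew; the sketch below only illustrates the idea and is not used by the model later in this section.
# Sketch (illustrative only): log-transform the skewed target and re-plot
ggplot(trainori, aes(x = log(SalePrice))) +
geom_histogram(aes(y = after_stat(density)), bins = 50, colour = "black", fill = "white") +
geom_density(alpha = .2, fill = "#FF6666") +
ggtitle("Histogram and Density Plot of log(Sale Price)") +
xlab("log(Sale Price)") + ylab("Density")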
# Drop the Id column, then record the class of each remaining column
df_num <- house_prices[, !colnames(house_prices) %in% c("Id")]
data_types <- sapply(df_num, class)
# Keep only the numeric features
df_num <- df_num[, data_types %in% c("numeric", "integer")]
# Set up the plot parameters for a 12x3 grid of panels
par(mfrow=c(12, 3), mar=c(4, 4, 2, 1), oma=c(0, 0, 2, 0))
# Create a histogram for each numeric feature
for (i in 1:ncol(df_num)) {
hist(df_num[, i], main="", xlab=colnames(df_num)[i], ylab="Frequency", breaks=50, col="blue")
}
# Reset plot parameters
par(mfrow=c(1, 1), mar=c(5, 4, 4, 2) + 0.1, oma=c(0, 0, 0, 0))
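The introduction to this section also mentions analysing correlations among features. A minimal sketch, using the numeric frame df_num built above and the training-set sale prices (the first 1460 rows of housejoin correspond to the training data):
# Sketch: correlation of each numeric feature (training rows only) with SalePrice
num_train <- df_num[1:1460, ]
cors <- sapply(num_train, function(col) cor(col, sale_price, use = "complete.obs"))
sort(cors, decreasing = TRUE) # OverallQual and GrLivArea are typically among the strongest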
# Identify categorical columns in your dataset
categorical_columns <- sapply(house_prices, is.factor)
# Create dummy variables for categorical columns
data_dummies <- model.matrix(~ . - 1, data = house_prices[, categorical_columns])
# Combine the dummy variables with the numeric columns
data_processed <- cbind(house_prices[, !categorical_columns], data_dummies)
# Now, 'data_processed' contains the dataset with dummy variables for categorical columns
# model.matrix already supplies its own column names, so this cleanup is usually a no-op:
colnames(data_processed) <- gsub("data_dummies", "", colnames(data_processed))
# Check the first few rows of the processed data
print(data_processed)
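A quick sanity check on the encoding (a sketch): the processed frame should now be entirely numeric, with one indicator column per retained factor level.
# Sketch: confirm the dummy encoding produced an all-numeric frame
dim(data_processed)
all(sapply(data_processed, is.numeric)) # expected TRUE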
# remove ID columns
dataset <- data_processed
dataset <- dataset[, !colnames(dataset) %in% c("Id")]
SalePrice <- sale_price
#split
datahousetrain <- cbind(dataset[1:1460,],SalePrice)
set.seed(123) # The seed only matters if a random split is used (see the sketch below)
split_ratio <- 0.7 # Proportion of rows assigned to the inner training set
n <- nrow(datahousetrain)
n_train <- round(n * split_ratio)
# Note: this is a sequential split (first 70% of rows for training, remaining 30% for validation)
train_inner <- datahousetrain[1:n_train, ]
test_inner <- datahousetrain[(n_train + 1):n, ]
test_actual <- dataset[1461:2919,]
print(train_inner)
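Because the split above is sequential, set.seed(123) has no effect on which rows land in each set. If a random 70/30 split is preferred, a minimal sketch using caret's createDataPartition (the *_rand names are illustrative):
# Sketch: random 70/30 split as an alternative to the sequential split above
set.seed(123)
idx <- caret::createDataPartition(datahousetrain$SalePrice, p = split_ratio, list = FALSE)
train_inner_rand <- datahousetrain[idx, ]
test_inner_rand <- datahousetrain[-idx, ]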
# Select the features by name and keep the target out of the matrix passed to xgb.DMatrix
x <- train_inner[, !colnames(train_inner) %in% "SalePrice"]
y <- train_inner$SalePrice
train_Data <- xgb.DMatrix(data = as.matrix(x), label = y)
params <- list(
objective = "reg:squarederror",
booster = "gbtree",
eval_metric = "rmse",
eta = 0.3, # Learning rate
max_depth = 12,
min_child_weight = 1,
subsample = 0.8,
colsample_bytree = 0.8
)
xgb_model <- xgboost(params = params, data = train_Data,
nrounds = 400, print_every_n = 100)
[1] train-rmse:142617.389208
[101] train-rmse:3.409170
[201] train-rmse:0.019425
[301] train-rmse:0.018439
[400] train-rmse:0.018129
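The training RMSE above falls to nearly zero, so the model fits the inner training set almost perfectly and the fixed nrounds = 400 is likely more than needed. One option is to pick the number of rounds by cross-validation with early stopping; a sketch using xgboost's xgb.cv (the fold count and stopping patience are illustrative assumptions):
# Sketch: choose nrounds via 5-fold CV with early stopping (illustrative settings)
cv <- xgb.cv(params = params, data = train_Data, nrounds = 400, nfold = 5,
early_stopping_rounds = 20, print_every_n = 100)
cv$best_iteration # round with the lowest mean validation RMSE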
#1. MEAN ABSOLUTE PERCENTAGE ERROR (MAPE)
MAPE = function(y_actual,y_predict){
mean(abs((y_actual-y_predict)/y_actual))*100
}
#2. R SQUARED error metric -- Coefficient of Determination
RSQUARE = function(y_actual,y_predict){
cor(y_actual,y_predict)^2
}
train_pred <- predict(xgb_model, train_Data)
rmse_train <- RMSE(y, train_pred) # RMSE() is provided by caret
r2_train <- RSQUARE(y,train_pred)
mape_train <- MAPE(y,train_pred)
namesmetrictrain <- c("RMSE Train","R Squared Train","MAPE Train")
scoremetrictrain <- c(rmse_train,r2_train,mape_train)
resultstrain<- as.data.frame(cbind(namesmetrictrain,scoremetrictrain))
colnames(resultstrain) <- c("Results","Score")
print(resultstrain)
x_test <- test_inner[, !colnames(test_inner) %in% "SalePrice"]
y_test <- test_inner$SalePrice
test_Data <- xgb.DMatrix(data = as.matrix(x_test))
test_pred <- predict(xgb_model,test_Data)
rmse_test <- RMSE(y_test,test_pred)
r2_test <- RSQUARE(y_test,test_pred)
mape_test <- MAPE(y_test,test_pred)
namesmetrictest <- c("RMSE Test","R Squared Test","MAPE Test")
scoremetrictest<- c(rmse_test,r2_test,mape_test)
resultstest <- as.data.frame(cbind(namesmetrictest,scoremetrictest))
colnames(resultstest) <- c("Results","Score")
print(resultstest)
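To see which features drive the predictions, xgboost can report an importance matrix for the trained model; a brief sketch:
# Sketch: inspect feature importance from the trained model
importance <- xgb.importance(feature_names = colnames(x), model = xgb_model)
head(importance, 15)
xgb.plot.importance(head(importance, 15)) # top 15 features by gain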
testactual_xgb <- xgb.DMatrix(data = as.matrix(test_actual))
pred_testactual <- predict(xgb_model,testactual_xgb)
submission_xgb <- as.data.frame(cbind(submission$Id,pred_testactual))
colnames(submission_xgb) <- c("Id","PredictedSalePrice")
print(submission_xgb)
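To produce a file that can actually be submitted, the predictions can be written out with write.csv. Note that Kaggle's sample_submission uses the column name SalePrice, so the header is renamed here; the output file name is an assumption.
# Sketch: write the predictions to a CSV (file name is illustrative)
submission_out <- data.frame(Id = submission$Id, SalePrice = pred_testactual)
write.csv(submission_out, "submission_xgb.csv", row.names = FALSE)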