# Loading required libraries
library(DT)
library(stringr)
# Load csv data
df <- read.csv("C:/Users/adity/Downloads/nanaimo.csv")
# Display top 8 columns from dataset
datatable(df[, c("X","address","price","area","bath","taxes", "taxyear","stratafee")])
Correlation provides us the strength of the relationship between 2 variables.
A negative correlation implies that, if one variable increases the other decreases and vice versa.
On the other hand, a positive correlation implies that if one variable increases, then the other one also increases and if one decreases the other one decreases as well.
For categorical data, we will not be able to test for correlation between string(character) and numeric data. Hence, we will encode the unique values (Eg: Powell River: 0 , Parksville: 1 …)
# To store numeric cols for finding correlation to target variable
numeric_cols <- c()
# All columns present in the dataframe
df_columns <- names(df)
for (col_name in df_columns){
if (is.numeric(df[,col_name]) == TRUE){
numeric_cols <- append(numeric_cols, col_name)
}
}
numeric_cols
## [1] "X" "price" "mls" "lat" "lng" "bed"
## [7] "area" "bath" "landarea" "lotwidth" "lotdepth" "age"
## [13] "taxes" "taxyear" "stratafee"
# Store correlation values of respective Columns
corr_values = c()
# Iterate through each column, and append calculated correlation score w.r.t to 'price' column
for (num_col in numeric_cols){
calc_corr <- cor(df[,num_col], df[,"price"], use='complete.obs')
corr_values <- append(corr_values, calc_corr)
}
# Create new dataframe, combining Column name and Correlation values
corr_df <- data.frame(numeric_cols, corr_values)
corr_df_sorted <- corr_df[order(-corr_values),]
corr_df_sorted
## numeric_cols corr_values
## 2 price 1.00000000
## 1 X 0.94017538
## 7 area 0.67467852
## 13 taxes 0.67407152
## 8 bath 0.52809544
## 6 bed 0.43519572
## 11 lotdepth 0.30078896
## 9 landarea 0.30044462
## 3 mls 0.29631017
## 10 lotwidth 0.23141636
## 12 age 0.22475732
## 5 lng 0.13626407
## 14 taxyear 0.09430585
## 4 lat 0.02795866
## 15 stratafee -0.33306197
# Correlation of 'area' with 'price'
area_price_corr <- cor(df$area, df$price, use="complete.obs")
cat("Correlation of Area with Price:",area_price_corr)
## Correlation of Area with Price: 0.6746785
library(ggplot2)
plot_1 <- ggplot(data=df,mapping=aes(x=area, y=price)) + geom_point() + ggtitle("Area w.r.t Price")
plot_1
Figure 1: Plotting Area values wrt to Price
plot_1 <- ggplot(data=df,mapping=aes(x=area, y=price)) + geom_point() + geom_smooth() + ggtitle("Area w.r.t Price - Regression line")
plot_1
Figure 2: Adding a best fit line - Area values wrt to Price
plot_1 <- ggplot(data=df,mapping=aes(x=area, y=price)) + geom_point() + geom_smooth(method="lm",se=FALSE) + ggtitle("Area w.r.t Price - Regression Line (Using Linear model)")
plot_1
Figure 3 : Adding a regression line (using lm) - Area values wrt to Price
# Using lm function to create a regression model
price_area_summ <- lm(formula=price~area, data=df)
summary(price_area_summ)
##
## Call:
## lm(formula = price ~ area, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -354270 -98622 -17701 63728 572017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.259e+05 1.294e+04 9.728 <2e-16 ***
## area 1.305e+02 6.782e+00 19.239 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 156800 on 443 degrees of freedom
## (104 observations deleted due to missingness)
## Multiple R-squared: 0.4552, Adjusted R-squared: 0.454
## F-statistic: 370.1 on 1 and 443 DF, p-value: < 2.2e-16
From Figure 3 we can see that the feature ‘price’ and area are highly correlated, i.e as area tends to increase, the price of the house also increases.
Thus, we can say that there is a positive correlation among the 2 features.
Also, from the summary generated using the lm function, we can observe that if the area is approx 130 sq.feet, the price of the house is roughly $125,900.
# Correlation of 'taxes' with 'price'
taxes_price_corr <- cor(df$taxes, df$price, use="complete.obs")
cat("Correlation of taxes with Price:", taxes_price_corr)
## Correlation of taxes with Price: 0.6740715
tax_price_plot <- ggplot(data=df,mapping=aes(x=taxes, y=price)) + geom_point() + ggtitle("Tax w.r.t Price")
tax_price_plot
Figure 4 : Plotting Taxes values wrt to Price
tax_price_plot_smooth <- ggplot(data=df,mapping=aes(x=taxes, y=price)) + geom_point() + geom_smooth() + ggtitle("Tax w.r.t Price - Regression line")
tax_price_plot_smooth
Figure 5 : Best fit line - Area values wrt to Price
tax_price_plot_smooth_lm <- ggplot(data=df,mapping=aes(x=taxes, y=price)) + geom_point() + geom_smooth(method="lm",se=FALSE) + ggtitle("Tax w.r.t Price - Regression Line (Using Linear model)")
tax_price_plot_smooth_lm
Figure 6 : Regression line - Area values wrt to Price
# Using lm function to create a regression model
tax_price_summ <- lm(price~taxes, data=df)
summary(tax_price_summ)
##
## Call:
## lm(formula = price ~ taxes, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -462915 -95522 -27534 62873 809417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58266.946 14743.118 3.952 8.91e-05 ***
## taxes 112.242 5.614 19.993 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 156900 on 480 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.4544, Adjusted R-squared: 0.4532
## F-statistic: 399.7 on 1 and 480 DF, p-value: < 2.2e-16
From the above correlation value, we can say that ‘area’ and ‘tax’ feature has a high correlation to ‘price’, i.e As the Area & taxes of the house increases, the Price of the house also increases.
Also from Figure 6, we can observe that there is a positive correlation among the two variables.
We can also observe that, if the taxes value is close to $112, the price is roughly $58,266.
# Splitting address into new column -> 'Location'
split_addr_by_comma <- function(x) str_trim(unlist(strsplit(x,","))[2])
df$Location <- mapply(split_addr_by_comma, df$address)
head(data.frame(Original_Address=df$address, Location_derived=df$Location))
## Original_Address Location_derived
## 1 122 - 1175 Resort Drive, Parksville Parksville
## 2 26 - 4837 Roger Street, Port Alberni Port Alberni
## 3 106c - 181 Beachside Drive, Parksville Parksville
## 4 Lot 15 Fernwood Ave, Powell River Powell River
## 5 A11 - 2157 Regent Road, Black Creek Black Creek
## 6 38 - 5455 Borden Place, Powell River Powell River
# Check all Unique values in new column 'Location'
unique(df$Location)
## [1] "Parksville" "Port Alberni" "Powell River" "Black Creek"
## [5] "Nanoose Bay" "Ucluelet" "Gabriola Island" "Savary Island"
## [9] "Errington" "Nanaimo" "Coombs" "Ladysmith"
## [13] "Duncan" "Campbell River" "Courtenay" "Denman Island"
## [17] "Texada Island" "Hornby Island" "Bamfield" "Youbou"
## [21] "Lasqueti Island" "Cumberland" "Saltair" "Ruxton Island"
## [25] "Comox" "French Creek" "Tofino" "47"
## [29] "Qualicum Beach" "Bowser" "Chemainus" "Union Bay"
## [33] "Merville" "Lantzville"
# Select row with outlier where Location value is "47"
datatable(df[which(df$Location == "47"),c("address", "Location")])
# Replacing original value of location by considering last occurence of "," in given Address
comma_indexes <- which(strsplit(df[which(df$Location == "47"),"address"],"")[[1]] == ",")
# Iterate through all indices, where "," is present in address value
# Eg: Address: Lots - 46,47,48 Marine Ave, Powell River -> Split on last occurence of "," -> location: Powell River
for (idx in comma_indexes){
# Check index of last occurence of "," and include substring which occurs after the "," character
if (idx == comma_indexes[length(comma_indexes)]){
df$Location[302] <- str_trim(str_sub(df$address[302],start=idx+1))
}
}
# Print new Location value
df$Location[302]
## [1] "Powell River"
# Label encoding
df$Location_encoded <- as.numeric(factor(df$Location))
head(df[,c("address","Location", "Location_encoded")])
## address Location Location_encoded
## 1 122 - 1175 Resort Drive, Parksville Parksville 22
## 2 26 - 4837 Roger Street, Port Alberni Port Alberni 23
## 3 106c - 181 Beachside Drive, Parksville Parksville 22
## 4 Lot 15 Fernwood Ave, Powell River Powell River 24
## 5 A11 - 2157 Regent Road, Black Creek Black Creek 2
## 6 38 - 5455 Borden Place, Powell River Powell River 24
loc_price_corr <- cor(df$Location_encoded, df$price, use="complete.obs")
cat("Correlation of Location wrt Price:", loc_price_corr)
## Correlation of Location wrt Price: -0.1620253
# New feature wrt to Price - Using Area, age, Total extra charges (Tax + strata fee)
#cor(df$area, df$price, use="complete.obs")
df$Combined_taxes_strata_fee <- df$taxes + df$stratafee
taxes_plus_strata_corr <- cor(df$Combined_taxes_strata_fee, df$price, use="complete.obs")
taxes_plus_strata_corr
## [1] 0.8162398
combined_taxes_strata_fee_lm = lm(price~Combined_taxes_strata_fee, data=df)
summary(combined_taxes_strata_fee_lm)
##
## Call:
## lm(formula = price ~ Combined_taxes_strata_fee, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -332070 -41836 -1010 24121 562603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36898.84 29542.23 -1.249 0.216
## Combined_taxes_strata_fee 123.02 10.05 12.236 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122500 on 75 degrees of freedom
## (472 observations deleted due to missingness)
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6618
## F-statistic: 149.7 on 1 and 75 DF, p-value: < 2.2e-16
We can immediately notice that the combination of the two columns viz. taxes and strata fee produces a higher correlation score, as opposed to treating each variable independently.
library(ggplot2)
scatter_plot_taxes <- ggplot(data=df,mapping=aes(x=Combined_taxes_strata_fee, y=price)) + geom_point() + ggtitle("Total Price as a function of Taxes + Strata fee")
scatter_plot_taxes
Figure 7 : Plotting New column(Combined_taxes_strata_fee) values wrt to Price
plot_1 <- ggplot(data=df,mapping=aes(x=Combined_taxes_strata_fee, y=price, color=Location)) + geom_point() + geom_smooth() + ggtitle("Total Price as a function of Taxes + Strata fee")
plot_1
Figure 8 : Best fit line : New column(Combined_taxes_strata_fee) values wrt to Price
plot_1 <- ggplot(data=df,mapping=aes(x=Combined_taxes_strata_fee, y=price)) + geom_point() + geom_smooth(method="lm",se=FALSE) + ggtitle("Total Price as a function of Taxes + Strata fee - Regression Line (Least squares line)")
plot_1
Figure 9 : Regression Line: New column(Combined_taxes_strata_fee) values wrt to Price
From Figure 9, we can observe that there is a positive correlation among the newly created feature (Taxes + strata fee) with the price feature.
Also, the correlation of the newly created feature tends to be higher, as opposed to treating each feature independently.