Part 1: Reading external data and storing it in a data frame called “houseprices.df”
library(data.table)
# Load the listings CSV into `houseprices.df`.
# `stringsAsFactors = TRUE` is spelled out because read.csv() no longer
# converts strings to factors by default since R 4.0.0. The str() and
# describe() output recorded further down shows factor columns, and the
# fread() call below already asks for factors explicitly.
houseprices.df <- read.csv("DATA_Unique_V5.csv", stringsAsFactors = TRUE)
# Second copy of the same file as a data.table. Nothing in the visible
# sections reads `dt` -- NOTE(review): confirm whether it is still needed.
dt <- fread(input = "DATA_Unique_V5.csv", stringsAsFactors = TRUE)
# NOTE(review): attach() is kept only because later sections refer to bare
# column names (e.g. Rent_Final, MacroArea). Prefer houseprices.df$col or
# `data = houseprices.df` in new code.
attach(houseprices.df)
Part 2: Data dimensions
# Report the number of rows and columns of the data frame
c(nrow(houseprices.df), ncol(houseprices.df))
## [1] 1910 26
Part 3: Column names
# List the variable names of the data frame
names(houseprices.df)
## [1] "Owner" "Area" "MacroArea"
## [4] "dataId" "Property.Link" "No..of.Rooms"
## [7] "No..of.Bathrooms" "Address.of.Locality" "Builder.Name"
## [10] "Rent" "No..of.Photos" "Furnishing.status"
## [13] "Floor.no" "Verified.Status" "Price.in.Integer"
## [16] "Latitude" "Longitude" "Super.Area"
## [19] "Carpert.Area" "Car.Parking" "Floor.Details"
## [22] "Total.Floors" "Flooring" "Facing"
## [25] "Tenants" "Rent_Final"
Part 4: Descriptive Statistics of the dataframe
library(psych)
# Per-column descriptive statistics from the psych package
psych::describe(houseprices.df)
## vars n mean sd median
## Owner* 1 1910 4.14 1.94 4.00
## Area* 2 1910 13.87 7.10 14.00
## MacroArea* 3 1910 3.33 1.95 3.00
## dataId 4 1910 42430243.52 5428863.90 44558085.00
## Property.Link* 5 1910 955.50 551.51 955.50
## No..of.Rooms 6 1910 1.68 0.47 2.00
## No..of.Bathrooms 7 1910 1.86 0.41 2.00
## Address.of.Locality* 8 1910 113.89 65.49 107.00
## Builder.Name* 9 1910 66.75 85.84 1.00
## Rent* 10 1910 88.87 43.47 88.00
## No..of.Photos* 11 1910 18.31 13.72 13.00
## Furnishing.status* 12 1910 2.16 0.75 2.00
## Floor.no 13 1702 6.55 5.60 5.00
## Verified.Status* 14 1910 10.05 22.62 1.00
## Price.in.Integer* 15 1910 130.40 63.50 128.00
## Latitude 16 1910 14.92 10.61 19.11
## Longitude 17 1910 53.41 32.02 72.84
## Super.Area 18 1857 911.03 272.29 950.00
## Carpert.Area* 19 1732 160.07 75.47 166.00
## Car.Parking* 20 1910 8.18 7.37 3.00
## Floor.Details* 21 1910 279.80 169.49 329.00
## Total.Floors 22 1702 13.26 9.25 11.00
## Flooring* 23 1910 57.19 43.67 71.00
## Facing* 24 1910 3.25 2.49 2.00
## Tenants* 25 1910 1.71 0.81 1.00
## Rent_Final 26 1910 44245.03 26914.36 40000.00
## trimmed mad min max
## Owner* 4.18 1.48 1 7.00
## Area* 13.87 8.90 1 26.00
## MacroArea* 3.28 2.97 1 6.00
## dataId 43788127.47 1305071.99 6743602 45622317.00
## Property.Link* 955.50 707.94 1 1910.00
## No..of.Rooms 1.73 0.00 1 2.00
## No..of.Bathrooms 1.92 0.00 1 5.00
## Address.of.Locality* 112.73 84.51 1 228.00
## Builder.Name* 52.64 0.00 1 268.00
## Rent* 89.27 47.44 1 178.00
## No..of.Photos* 18.01 16.31 1 38.00
## Furnishing.status* 2.20 1.48 1 3.00
## Floor.no 5.74 4.45 -2 44.00
## Verified.Status* 3.42 0.00 1 99.00
## Price.in.Integer* 131.36 71.16 1 253.00
## Latitude 15.38 0.12 0 72.92
## Longitude 57.65 0.07 0 72.97
## Super.Area 909.06 303.93 60 2079.00
## Carpert.Area* 162.13 90.44 1 293.00
## Car.Parking* 7.98 2.97 1 17.00
## Floor.Details* 285.63 173.46 1 547.00
## Total.Floors 11.90 5.93 1 63.00
## Flooring* 58.33 40.03 1 116.00
## Facing* 2.82 1.48 1 9.00
## Tenants* 1.64 0.00 1 3.00
## Rent_Final 40985.37 20756.40 0 225000.00
## range skew kurtosis se
## Owner* 6.00 -0.22 -0.98 0.04
## Area* 25.00 0.00 -1.07 0.16
## MacroArea* 5.00 0.10 -1.51 0.04
## dataId 38878715.00 -2.94 9.35 124220.21
## Property.Link* 1909.00 0.00 -1.20 12.62
## No..of.Rooms 1.00 -0.78 -1.39 0.01
## No..of.Bathrooms 4.00 -0.78 3.11 0.01
## Address.of.Locality* 227.00 0.15 -1.17 1.50
## Builder.Name* 267.00 0.98 -0.45 1.96
## Rent* 177.00 -0.02 -0.73 0.99
## No..of.Photos* 37.00 0.15 -1.71 0.31
## Furnishing.status* 2.00 -0.27 -1.20 0.02
## Floor.no 46.00 1.91 5.86 0.14
## Verified.Status* 98.00 2.44 4.70 0.52
## Price.in.Integer* 252.00 -0.05 -0.82 1.45
## Latitude 72.92 1.39 9.23 0.24
## Longitude 72.97 -1.04 -0.89 0.73
## Super.Area 2019.00 0.12 0.05 6.32
## Carpert.Area* 292.00 -0.24 -1.09 1.81
## Car.Parking* 16.00 0.32 -1.83 0.17
## Floor.Details* 546.00 -0.40 -1.16 3.88
## Total.Floors 62.00 1.56 2.99 0.22
## Flooring* 115.00 -0.31 -1.68 1.00
## Facing* 8.00 1.42 0.70 0.06
## Tenants* 2.00 0.57 -1.24 0.02
## Rent_Final 225000.00 1.56 4.11 615.84
Part 5: Data structures
# Show each column's storage type together with its first few values
utils::str(houseprices.df)
## 'data.frame': 1910 obs. of 26 variables:
## $ Owner : Factor w/ 7 levels "Apoorv","dhonde",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Area : Factor w/ 26 levels "andheri","Andheri",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ MacroArea : Factor w/ 6 levels "Central","East",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ dataId : int 44414015 44653729 38459903 44075693 45342967 45196703 44470003 45052161 43507931 45464203 ...
## $ Property.Link : Factor w/ 1910 levels "https://www.magicbricks.com/propertyDetails/1-BHK-1000-Sq-ft-Builder-Floor-Apartment-FOR-Rent-Santacruz-West-in"| __truncated__,..: 528 1106 915 1341 1380 1067 897 1105 830 752 ...
## $ No..of.Rooms : int 1 2 2 2 2 2 2 2 2 2 ...
## $ No..of.Bathrooms : int 1 2 2 2 2 2 2 3 2 2 ...
## $ Address.of.Locality: Factor w/ 228 levels "4 Bunglows","Aarey Milk Colony",..: 122 9 29 9 65 122 9 8 136 9 ...
## $ Builder.Name : Factor w/ 268 levels "","A M Enterprises",..: 1 81 257 11 11 58 1 1 1 44 ...
## $ Rent : Factor w/ 178 levels "","1","1 Lac",..: 92 129 111 169 177 124 152 124 88 141 ...
## $ No..of.Photos : Factor w/ 38 levels "0","1","10","11",..: 36 35 6 38 38 38 3 8 9 4 ...
## $ Furnishing.status : Factor w/ 3 levels "Furnished","Semi-Furnished",..: 1 2 2 2 2 3 3 2 3 3 ...
## $ Floor.no : int 1 5 NA 9 25 19 1 7 8 12 ...
## $ Verified.Status : Factor w/ 99 levels "","100000","105000",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Price.in.Integer : Factor w/ 253 levels "0","1 Lac","1.1 Lac",..: 136 190 162 243 252 183 217 183 128 208 ...
## $ Latitude : num 0 19.1 19.1 0 19.1 ...
## $ Longitude : num 0 72.8 72.9 0 72.8 ...
## $ Super.Area : int 700 1200 1100 1361 1499 1160 1100 1200 1050 1040 ...
## $ Carpert.Area : Factor w/ 293 levels "1000","1015",..: 135 221 234 260 292 163 260 243 256 193 ...
## $ Car.Parking : Factor w/ 17 levels "1 Covered","1 Covered, 1 Open",..: 3 1 3 6 6 1 3 1 17 3 ...
## $ Floor.Details : Factor w/ 547 levels "","-1 (11 )",..: 40 392 1 529 287 220 40 462 488 105 ...
## $ Total.Floors : int 7 13 NA 31 29 22 7 16 10 18 ...
## $ Flooring : Factor w/ 116 levels "","Ceramic Tiles",..: 98 32 98 98 75 98 115 104 98 1 ...
## $ Facing : Factor w/ 9 levels "","East","North",..: 3 4 2 1 2 4 2 2 2 1 ...
## $ Tenants : Factor w/ 3 levels "","Bachelors",..: 1 1 2 3 3 3 3 1 1 1 ...
## $ Rent_Final : int 42000 58000 48100 90000 98000 55000 72000 55000 40000 65000 ...
part 6: Histogram/Density Plot/Quantiles/Normality plots of Rent
# Distribution of the final rent: histogram, kernel density estimate,
# normal Q-Q plot and quartiles.
rent <- houseprices.df$Rent_Final

# histogram
hist(
  rent,
  main = "Histogram of variable Rent",
  xlab = "Rent",
  col = c("gray")
)

# kernel density estimate: draw the curve, then fill the area under it
rent_density <- density(rent)
plot(rent_density, frame = TRUE, main = "Density Plot of Rent")
polygon(rent_density, col = "black")

# normal Q-Q plot with its reference line
qqnorm(rent)
qqline(rent)

# quartiles (0%, 25%, 50%, 75%, 100%)
quantile(rent)
## 0% 25% 50% 75% 100%
## 0 26125 40000 55000 225000
part 7: boxplot of rent
# Horizontal box-and-whisker plot of the final rent
boxplot(
  houseprices.df$Rent_Final,
  main = "boxplot for variable Rent",
  xlab = "Rent",
  col = c("lightblue"),
  horizontal = TRUE,
  width = 0.5
)

part 8: Barplot of Frequency of data by macro region and region
# Row counts per macro area, shown as a bar chart
counts1 <- with(houseprices.df, table(MacroArea))
barplot(counts1, xlab = "", main = "MacroArea")

# Row counts per area, shown as a bar chart
counts2 <- with(houseprices.df, table(Area))
barplot(counts2, xlab = "", main = "Area")

part 9: Mean, std dev and other rent descriptives
# Mean, standard deviation and the other psych descriptives for the final rent
psych::describe(houseprices.df$Rent_Final)
## vars n mean sd median trimmed mad min max range
## X1 1 1910 44245.03 26914.36 40000 40985.37 20756.4 0 225000 225000
## skew kurtosis se
## X1 1.56 4.11 615.84
part 10 Boxplot for Rent based on macro areas
# Rent distribution within each macro area.
# Only four colours are supplied, so R recycles them when there are more
# groups than colours (MacroArea has six levels in the recorded str() output).
macro_area_colours <- c("Blue","red","gray","black")
boxplot(
  Rent_Final ~ MacroArea,
  data = houseprices.df,
  col = macro_area_colours,
  main = "Boxplot for rent grouped by MacroArea"
)

part 11 Boxplot of rent split by no. of rooms
# Rent distribution for each room count
boxplot(
  Rent_Final ~ No..of.Rooms,
  data = houseprices.df,
  main = "Boxplot of rent split by bedrooms",
  xlab = "No. of rooms",
  ylab = "Rent"
)

part 12 Boxplot of rent split by no. of Bathrooms
# Rent distribution for each bathroom count
boxplot(
  Rent_Final ~ No..of.Bathrooms,
  data = houseprices.df,
  main = "Boxplot of rent split by no. of Bathrooms",
  xlab = "No. of Bathrooms",
  ylab = "Rent"
)

Part 13: Trend of (Rent/Super Area) by macro region
# Rent per unit of super area, compared across macro areas
boxplot(
  (Rent_Final / Super.Area) ~ MacroArea,
  data = houseprices.df,
  main = "Boxplot of (Rent/Superarea) by macroregion",
  xlab = "MacroRegion",
  ylab = "Rent/Super Area"
)

Part 14: Scatter Plot of Area and MacroArea with Rent
# loading the package
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# A large scipen penalty makes R prefer fixed over scientific notation
options(scipen = 10000)

# Rent against Area, with the x-axis labels rotated by 90 degrees
p <- ggplot(
  data = houseprices.df,
  mapping = aes(x = Area, y = Rent_Final)
) +
  geom_point()
p +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Area", y = "Rent", title = "Scatterplot of Area and Rent")

# Rent against MacroArea
p <- ggplot(
  data = houseprices.df,
  mapping = aes(x = MacroArea, y = Rent_Final)
) +
  geom_point()
p +
  labs(
    x = "Macro Area",
    y = "Rent",
    title = "Scatterplot of Macro Area and Rent"
  )

Part 15: Scatter plots of rent per sq ft (Rent/Super Area) against Area and MacroArea
# Rent per square foot of super area; NA wherever Super.Area is missing.
# This is a free-standing vector, not a column of houseprices.df, so aes()
# below resolves it from the calling environment.
Rent_per_sqFtarea <- Rent_Final / Super.Area
# loading the package
library(ggplot2)
options(scipen = 10000)
# Rent per sq ft against Area, with the x-axis labels rotated by 90 degrees
p <- ggplot(
  data = houseprices.df,
  mapping = aes(x = Area, y = Rent_per_sqFtarea)
) +
  geom_point()
p +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    x = "Area",
    y = "Rent per sq ft",
    title = "Scatterplot of Area and Rent per sq ft"
  )
## Warning: Removed 53 rows containing missing values (geom_point).

# Rent per sq ft against MacroArea. Rent_per_sqFtarea is the vector computed
# earlier in this section; aes() picks it up from the calling environment.
p <- ggplot(
  data = houseprices.df,
  mapping = aes(x = MacroArea, y = Rent_per_sqFtarea)
) +
  geom_point()
p +
  labs(
    x = "Macro Area",
    y = "Rent per sq ft",
    title = "Scatterplot of Macro Area and Rent per sq ft"
  )
## Warning: Removed 53 rows containing missing values (geom_point).

Part 16: Scatter Plot of No of photos and Rent
# No..of.Photos is read as a factor (levels "0", "1", "10", "11", ...), so a
# bare as.numeric() returns the internal level codes, not the photo counts.
# Going through as.character() recovers the real values; it is also harmless
# if the column is already character or numeric.
No..of.Photos <- as.numeric(as.character(houseprices.df$No..of.Photos))
plot(
  No..of.Photos,
  houseprices.df$Rent_Final,
  main = "Plot of No of Photos and Rent",
  xlab = "No of Photos",
  ylab = "Rent"
)

Part 17, 18, 19: Scatter plots of Total Floors, Floor No and (Floor No / Total Floors) against Rent
# Rent against the total number of floors in the building.
# The x-axis label used to read "Floor No", which belongs to the next plot.
plot(
  houseprices.df$Total.Floors,
  houseprices.df$Rent_Final,
  main = "Plot of Total Floors and Rent",
  xlab = "Total Floors",
  ylab = "Rent"
)

# Rent against the floor the property is on
Floor.no <- as.numeric(houseprices.df$Floor.no)
plot(
  Floor.no,
  houseprices.df$Rent_Final,
  main = "Plot of Floor No and Rent",
  xlab = "Floor No",
  ylab = "Rent"
)

# Relative position in the building: floor number divided by total floors
# (NA where either value is missing)
Floor.Location <- Floor.no / houseprices.df$Total.Floors
plot(
  Floor.Location,
  houseprices.df$Rent_Final,
  main = "Plot of (Floor No/Total Floors) and Rent",
  xlab = "(Floor No / Total Floors)",
  ylab = "Rent"
)

Part 20 and 21: Scatter Plot of Super Area, Carpet Area with Rent
# Carpert.Area is read as a factor, so plotting it directly (or calling a
# bare as.numeric() on it) uses the internal level codes instead of the
# areas. Convert through character once, up front, and reuse the result.
Carpert.Area <- as.numeric(as.character(houseprices.df$Carpert.Area))

# Carpet area against super area
plot(
  houseprices.df$Super.Area,
  Carpert.Area,
  main = "Plot of Super Area and Carpet Area",
  xlab = "Super Area",
  ylab = "Carpet Area"
)

# Rent against super area
plot(
  houseprices.df$Super.Area,
  houseprices.df$Rent_Final,
  main = "Plot of Super Area and Rent",
  xlab = "Super Area",
  ylab = "Rent"
)

# Rent against carpet area
plot(
  Carpert.Area,
  houseprices.df$Rent_Final,
  main = "Plot of Carpet Area and Rent",
  xlab = "Carpet Area",
  ylab = "Rent"
)

Part 22: Mean rent by number of rooms and by number of bathrooms
# Mean final rent for each room count
with(
  houseprices.df,
  aggregate(Rent_Final, by = list(No..of.Rooms), FUN = mean)
)
## Group.1 x
## 1 1 25960.70
## 2 2 52762.75
# Mean final rent for each bathroom count
bathroom_groups <- list(houseprices.df$No..of.Bathrooms)
aggregate(houseprices.df$Rent_Final, by = bathroom_groups, FUN = mean)
## Group.1 x
## 1 1 27314.48
## 2 2 47042.26
## 3 3 68166.67
## 4 5 75000.00
#library(dplyr)
#group <- group_by(houseprices.df,`No. of Rooms`,`No. of Bathrooms`)
#summarise(group, count = n(),
#mean = mean(Rent_Final, na.rm = TRUE),
#sd = sd(Rent_Final, na.rm = TRUE))
Part 23: Boxplot of rent grouped by facing direction
# Rent distribution for each facing direction. las = 2 turns the axis labels
# perpendicular to the axis. Only four colours are supplied, so R recycles
# them when there are more groups than colours.
facing_colours <- c("Blue","red","green","yellow")
boxplot(
  Rent_Final ~ Facing,
  data = houseprices.df,
  col = facing_colours,
  las = 2,
  main = "Boxplot for rent grouped by Facing direction"
)

Part 24: Correlation matrix for all the continuous variables
# Column positions used for the correlation analysis: No..of.Rooms,
# No..of.Bathrooms, No..of.Photos, Floor.no, Super.Area, Carpert.Area,
# Total.Floors and Rent_Final (positions as listed by colnames() above).
corr_cols <- c(6, 7, 11, 13, 18, 19, 22, 26)

# Convert one column to numeric. A factor is converted through its labels,
# because as.numeric() on a factor returns the internal level codes rather
# than the values (this affects No..of.Photos and Carpert.Area).
to_numeric <- function(column) {
  if (is.factor(column)) {
    as.numeric(as.character(column))
  } else {
    as.numeric(column)
  }
}

# lapply() always returns a list of columns, unlike sapply(), whose result
# type depends on the input.
houseprices.df[corr_cols] <- lapply(houseprices.df[corr_cols], to_numeric)
library(psych)
# use = "complete" restricts the test to rows with no missing value in any
# of the selected columns.
corr.test(houseprices.df[, corr_cols], use = "complete")
## Call:corr.test(x = houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)],
## use = "complete")
## Correlation matrix
## No..of.Rooms No..of.Bathrooms No..of.Photos Floor.no
## No..of.Rooms 1.00 0.56 -0.03 0.26
## No..of.Bathrooms 0.56 1.00 -0.04 0.20
## No..of.Photos -0.03 -0.04 1.00 0.01
## Floor.no 0.26 0.20 0.01 1.00
## Super.Area 0.77 0.52 -0.04 0.28
## Carpert.Area 0.73 0.44 -0.05 0.27
## Total.Floors 0.32 0.22 0.00 0.73
## Rent_Final 0.46 0.30 0.04 0.07
## Super.Area Carpert.Area Total.Floors Rent_Final
## No..of.Rooms 0.77 0.73 0.32 0.46
## No..of.Bathrooms 0.52 0.44 0.22 0.30
## No..of.Photos -0.04 -0.05 0.00 0.04
## Floor.no 0.28 0.27 0.73 0.07
## Super.Area 1.00 0.66 0.37 0.58
## Carpert.Area 0.66 1.00 0.32 0.42
## Total.Floors 0.37 0.32 1.00 0.08
## Rent_Final 0.58 0.42 0.08 1.00
## Sample Size
## [1] 1530
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## No..of.Rooms No..of.Bathrooms No..of.Photos Floor.no
## No..of.Rooms 0.00 0.0 0.60 0
## No..of.Bathrooms 0.00 0.0 0.60 0
## No..of.Photos 0.19 0.1 0.00 1
## Floor.no 0.00 0.0 0.68 0
## Super.Area 0.00 0.0 0.11 0
## Carpert.Area 0.00 0.0 0.07 0
## Total.Floors 0.00 0.0 0.97 0
## Rent_Final 0.00 0.0 0.14 0
## Super.Area Carpert.Area Total.Floors Rent_Final
## No..of.Rooms 0.0 0.00 0 0.00
## No..of.Bathrooms 0.0 0.00 0 0.00
## No..of.Photos 0.6 0.47 1 0.60
## Floor.no 0.0 0.00 0 0.03
## Super.Area 0.0 0.00 0 0.00
## Carpert.Area 0.0 0.00 0 0.00
## Total.Floors 0.0 0.00 0 0.03
## Rent_Final 0.0 0.00 0 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
Part 25: Plotting Correlation Matrix
#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Correlation chart of the numeric columns; histogram = TRUE adds a
# histogram for each variable
chart_cols <- houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)]
chart.Correlation(chart_cols, histogram = TRUE, pch = 19)

library(corrgram)
# corrgram of the numeric columns: panel.shade below the diagonal,
# panel.conf above it, variable names drawn by panel.txt
corrgram(
  houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)],
  main = "Corrgram",
  text.panel = panel.txt,
  upper.panel = panel.conf,
  lower.panel = panel.shade
)
