Part 1: Reading external data and storing it in a data frame called “houseprices.df”

# Load the listings CSV: once as a base data.frame (the object every later
# chunk works on) and once as a data.table via fread().
library(data.table)
# stringsAsFactors is spelled out because its default changed to FALSE in
# R 4.0.0, while the str()/describe() output recorded in this document shows
# the text columns as factors.
houseprices.df <- read.csv("DATA_Unique_V5.csv", stringsAsFactors = TRUE)
dt <- fread(input = "DATA_Unique_V5.csv", stringsAsFactors = TRUE)
# NOTE(review): attach() is discouraged; it is kept here only because later
# chunks refer to columns by bare name. Attached columns are copies taken at
# this point and do not follow later modifications of houseprices.df.
attach(houseprices.df)

Part 2: Data Dimensions

# Number of rows followed by number of columns of the data frame
c(nrow(houseprices.df), ncol(houseprices.df))
## [1] 1910   26

Part 3: Column names

# List the variable (column) names of the data frame
names(houseprices.df)
##  [1] "Owner"               "Area"                "MacroArea"          
##  [4] "dataId"              "Property.Link"       "No..of.Rooms"       
##  [7] "No..of.Bathrooms"    "Address.of.Locality" "Builder.Name"       
## [10] "Rent"                "No..of.Photos"       "Furnishing.status"  
## [13] "Floor.no"            "Verified.Status"     "Price.in.Integer"   
## [16] "Latitude"            "Longitude"           "Super.Area"         
## [19] "Carpert.Area"        "Car.Parking"         "Floor.Details"      
## [22] "Total.Floors"        "Flooring"            "Facing"             
## [25] "Tenants"             "Rent_Final"

Part 4: Descriptive Statistics of the dataframe

# psych supplies describe(), a one-row-per-column table of summary statistics
library(psych)
# Rows whose name ends in "*" are non-numeric columns that describe() has
# converted to numeric codes, so their mean/sd are not meaningful.
psych::describe(houseprices.df)
##                      vars    n        mean         sd      median
## Owner*                  1 1910        4.14       1.94        4.00
## Area*                   2 1910       13.87       7.10       14.00
## MacroArea*              3 1910        3.33       1.95        3.00
## dataId                  4 1910 42430243.52 5428863.90 44558085.00
## Property.Link*          5 1910      955.50     551.51      955.50
## No..of.Rooms            6 1910        1.68       0.47        2.00
## No..of.Bathrooms        7 1910        1.86       0.41        2.00
## Address.of.Locality*    8 1910      113.89      65.49      107.00
## Builder.Name*           9 1910       66.75      85.84        1.00
## Rent*                  10 1910       88.87      43.47       88.00
## No..of.Photos*         11 1910       18.31      13.72       13.00
## Furnishing.status*     12 1910        2.16       0.75        2.00
## Floor.no               13 1702        6.55       5.60        5.00
## Verified.Status*       14 1910       10.05      22.62        1.00
## Price.in.Integer*      15 1910      130.40      63.50      128.00
## Latitude               16 1910       14.92      10.61       19.11
## Longitude              17 1910       53.41      32.02       72.84
## Super.Area             18 1857      911.03     272.29      950.00
## Carpert.Area*          19 1732      160.07      75.47      166.00
## Car.Parking*           20 1910        8.18       7.37        3.00
## Floor.Details*         21 1910      279.80     169.49      329.00
## Total.Floors           22 1702       13.26       9.25       11.00
## Flooring*              23 1910       57.19      43.67       71.00
## Facing*                24 1910        3.25       2.49        2.00
## Tenants*               25 1910        1.71       0.81        1.00
## Rent_Final             26 1910    44245.03   26914.36    40000.00
##                          trimmed        mad     min         max
## Owner*                      4.18       1.48       1        7.00
## Area*                      13.87       8.90       1       26.00
## MacroArea*                  3.28       2.97       1        6.00
## dataId               43788127.47 1305071.99 6743602 45622317.00
## Property.Link*            955.50     707.94       1     1910.00
## No..of.Rooms                1.73       0.00       1        2.00
## No..of.Bathrooms            1.92       0.00       1        5.00
## Address.of.Locality*      112.73      84.51       1      228.00
## Builder.Name*              52.64       0.00       1      268.00
## Rent*                      89.27      47.44       1      178.00
## No..of.Photos*             18.01      16.31       1       38.00
## Furnishing.status*          2.20       1.48       1        3.00
## Floor.no                    5.74       4.45      -2       44.00
## Verified.Status*            3.42       0.00       1       99.00
## Price.in.Integer*         131.36      71.16       1      253.00
## Latitude                   15.38       0.12       0       72.92
## Longitude                  57.65       0.07       0       72.97
## Super.Area                909.06     303.93      60     2079.00
## Carpert.Area*             162.13      90.44       1      293.00
## Car.Parking*                7.98       2.97       1       17.00
## Floor.Details*            285.63     173.46       1      547.00
## Total.Floors               11.90       5.93       1       63.00
## Flooring*                  58.33      40.03       1      116.00
## Facing*                     2.82       1.48       1        9.00
## Tenants*                    1.64       0.00       1        3.00
## Rent_Final              40985.37   20756.40       0   225000.00
##                            range  skew kurtosis        se
## Owner*                      6.00 -0.22    -0.98      0.04
## Area*                      25.00  0.00    -1.07      0.16
## MacroArea*                  5.00  0.10    -1.51      0.04
## dataId               38878715.00 -2.94     9.35 124220.21
## Property.Link*           1909.00  0.00    -1.20     12.62
## No..of.Rooms                1.00 -0.78    -1.39      0.01
## No..of.Bathrooms            4.00 -0.78     3.11      0.01
## Address.of.Locality*      227.00  0.15    -1.17      1.50
## Builder.Name*             267.00  0.98    -0.45      1.96
## Rent*                     177.00 -0.02    -0.73      0.99
## No..of.Photos*             37.00  0.15    -1.71      0.31
## Furnishing.status*          2.00 -0.27    -1.20      0.02
## Floor.no                   46.00  1.91     5.86      0.14
## Verified.Status*           98.00  2.44     4.70      0.52
## Price.in.Integer*         252.00 -0.05    -0.82      1.45
## Latitude                   72.92  1.39     9.23      0.24
## Longitude                  72.97 -1.04    -0.89      0.73
## Super.Area               2019.00  0.12     0.05      6.32
## Carpert.Area*             292.00 -0.24    -1.09      1.81
## Car.Parking*               16.00  0.32    -1.83      0.17
## Floor.Details*            546.00 -0.40    -1.16      3.88
## Total.Floors               62.00  1.56     2.99      0.22
## Flooring*                 115.00 -0.31    -1.68      1.00
## Facing*                     8.00  1.42     0.70      0.06
## Tenants*                    2.00  0.57    -1.24      0.02
## Rent_Final             225000.00  1.56     4.11    615.84

Part 5: Data structures

# Display the data structures
# str() prints, for every column, its storage type and the first few values.
str(houseprices.df)
## 'data.frame':    1910 obs. of  26 variables:
##  $ Owner              : Factor w/ 7 levels "Apoorv","dhonde",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Area               : Factor w/ 26 levels "andheri","Andheri",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ MacroArea          : Factor w/ 6 levels "Central","East",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ dataId             : int  44414015 44653729 38459903 44075693 45342967 45196703 44470003 45052161 43507931 45464203 ...
##  $ Property.Link      : Factor w/ 1910 levels "https://www.magicbricks.com/propertyDetails/1-BHK-1000-Sq-ft-Builder-Floor-Apartment-FOR-Rent-Santacruz-West-in"| __truncated__,..: 528 1106 915 1341 1380 1067 897 1105 830 752 ...
##  $ No..of.Rooms       : int  1 2 2 2 2 2 2 2 2 2 ...
##  $ No..of.Bathrooms   : int  1 2 2 2 2 2 2 3 2 2 ...
##  $ Address.of.Locality: Factor w/ 228 levels "4 Bunglows","Aarey Milk Colony",..: 122 9 29 9 65 122 9 8 136 9 ...
##  $ Builder.Name       : Factor w/ 268 levels "","A M Enterprises",..: 1 81 257 11 11 58 1 1 1 44 ...
##  $ Rent               : Factor w/ 178 levels "","1","1 Lac",..: 92 129 111 169 177 124 152 124 88 141 ...
##  $ No..of.Photos      : Factor w/ 38 levels "0","1","10","11",..: 36 35 6 38 38 38 3 8 9 4 ...
##  $ Furnishing.status  : Factor w/ 3 levels "Furnished","Semi-Furnished",..: 1 2 2 2 2 3 3 2 3 3 ...
##  $ Floor.no           : int  1 5 NA 9 25 19 1 7 8 12 ...
##  $ Verified.Status    : Factor w/ 99 levels "","100000","105000",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price.in.Integer   : Factor w/ 253 levels "0","1 Lac","1.1 Lac",..: 136 190 162 243 252 183 217 183 128 208 ...
##  $ Latitude           : num  0 19.1 19.1 0 19.1 ...
##  $ Longitude          : num  0 72.8 72.9 0 72.8 ...
##  $ Super.Area         : int  700 1200 1100 1361 1499 1160 1100 1200 1050 1040 ...
##  $ Carpert.Area       : Factor w/ 293 levels "1000","1015",..: 135 221 234 260 292 163 260 243 256 193 ...
##  $ Car.Parking        : Factor w/ 17 levels "1 Covered","1 Covered, 1 Open",..: 3 1 3 6 6 1 3 1 17 3 ...
##  $ Floor.Details      : Factor w/ 547 levels "","-1 (11 )",..: 40 392 1 529 287 220 40 462 488 105 ...
##  $ Total.Floors       : int  7 13 NA 31 29 22 7 16 10 18 ...
##  $ Flooring           : Factor w/ 116 levels "","Ceramic Tiles",..: 98 32 98 98 75 98 115 104 98 1 ...
##  $ Facing             : Factor w/ 9 levels "","East","North",..: 3 4 2 1 2 4 2 2 2 1 ...
##  $ Tenants            : Factor w/ 3 levels "","Bachelors",..: 1 1 2 3 3 3 3 1 1 1 ...
##  $ Rent_Final         : int  42000 58000 48100 90000 98000 55000 72000 55000 40000 65000 ...

Part 6: Histogram, density plot, normality (Q-Q) plot and quantiles of Rent

# Distribution of the final rent. Columns are read from houseprices.df
# explicitly so this chunk does not depend on attach().

# Histogram of rent
hist(
  houseprices.df$Rent_Final,
  main = "Histogram of variable Rent",
  xlab = "Rent",
  col = "gray"
)

# Kernel density of rent: estimate once, then reuse the same estimate for
# the outline and for the filled polygon.
rent_density <- density(houseprices.df$Rent_Final)
plot(rent_density, frame.plot = TRUE, main = "Density Plot of Rent")
polygon(rent_density, col = "black")

# Normal Q-Q plot with reference line
qqnorm(houseprices.df$Rent_Final)
qqline(houseprices.df$Rent_Final)

# Quartiles (0%, 25%, 50%, 75%, 100%) of rent
quantile(houseprices.df$Rent_Final)
##     0%    25%    50%    75%   100% 
##      0  26125  40000  55000 225000

part 7: boxplot of rent

# Horizontal box-and-whisker plot of the final rent
boxplot(
  houseprices.df$Rent_Final,
  main = "boxplot for variable Rent",
  xlab = "Rent",
  col = "lightblue",
  horizontal = TRUE,
  width = 0.5
)

part 8: Barplot of Frequency of data by macro region and region

# Number of listings per macro region. The column is taken from
# houseprices.df explicitly instead of relying on attach().
counts1 <- table(MacroArea = houseprices.df$MacroArea)
barplot(counts1, main = "MacroArea", xlab = "")

# Number of listings per area.
# NOTE(review): the Area levels include both "andheri" and "Andheri", so the
# same locality appears as two bars; consider normalising case upstream.
counts2 <- table(Area = houseprices.df$Area)
barplot(counts2, main = "Area", xlab = "")

part 9: Mean, std dev and other rent descriptives

# Summary statistics (n, mean, sd, median, skew, ...) of the final rent,
# read from the data frame rather than from an attached copy.
describe(houseprices.df$Rent_Final)
##    vars    n     mean       sd median  trimmed     mad min    max  range
## X1    1 1910 44245.03 26914.36  40000 40985.37 20756.4   0 225000 225000
##    skew kurtosis     se
## X1 1.56     4.11 615.84

part 10 Boxplot for Rent based on macro areas

# Rent distribution within each macro area; the four fill colours are
# recycled when there are more groups than colours.
boxplot(
  Rent_Final ~ MacroArea,
  data = houseprices.df,
  main = "Boxplot for rent grouped by MacroArea",
  col = c("Blue", "red", "gray", "black")
)

part 11 Boxplot of rent split by no. of rooms

# Rent distribution per number of rooms. `data =` makes the formula resolve
# against houseprices.df instead of attached copies of its columns.
boxplot(
  Rent_Final ~ No..of.Rooms,
  data = houseprices.df,
  xlab = "No. of rooms",
  ylab = "Rent",
  main = "Boxplot of rent split by bedrooms"
)

part 12 Boxplot of rent split by no. of Bathrooms

# Rent distribution per number of bathrooms. `data =` makes the formula
# resolve against houseprices.df instead of attached copies of its columns.
boxplot(
  Rent_Final ~ No..of.Bathrooms,
  data = houseprices.df,
  xlab = "No. of Bathrooms",
  ylab = "Rent",
  main = "Boxplot of rent split by no. of Bathrooms"
)

Part 13: Boxplot of rent per unit of super area (Rent/Super Area) by macro region

# Rent per unit of super area, grouped by macro region. `data =` makes the
# formula resolve against houseprices.df instead of attached copies; rows
# with a missing Super.Area drop out of the plot.
boxplot(
  (Rent_Final / Super.Area) ~ MacroArea,
  data = houseprices.df,
  xlab = "MacroRegion",
  ylab = "Rent/Super Area",
  main = "Boxplot of (Rent/Superarea) by macroregion"
)

Part 14: Scatter Plot of Area and MacroArea with Rent

# loading the package
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Strongly prefer fixed over scientific notation for axis labels
options(scipen = 10000)

# Rent against Area (categorical x-axis); area names are rotated 90 degrees
p <- ggplot(houseprices.df, aes(x = Area, y = Rent_Final)) + geom_point()

p +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Area", y = "Rent", title = "Scatterplot of Area and Rent")

# Rent against MacroArea (categorical x-axis)
p <- ggplot(houseprices.df, aes(x = MacroArea, y = Rent_Final)) + geom_point()
p +
  labs(x = "Macro Area", y = "Rent", title = "Scatterplot of Macro Area and Rent")

Part 15: Scatter plots of rent per sq ft (Rent/Super Area) against Area and MacroArea

# Rent per square foot of super area, computed from the data frame columns
# explicitly (NA wherever Super.Area is missing).
Rent_per_sqFtarea <- houseprices.df$Rent_Final / houseprices.df$Super.Area

# Keep the derived value next to its grouping columns so that ggplot finds
# every aesthetic in the plot data instead of falling back to a global vector.
rent_per_sqft.df <- data.frame(
  Area = houseprices.df$Area,
  MacroArea = houseprices.df$MacroArea,
  Rent_per_sqFtarea = Rent_per_sqFtarea
)

# loading the package
library(ggplot2)
options(scipen = 10000)
# Rent per sq ft against Area (categorical x-axis, labels rotated 90 degrees)
p <- ggplot(rent_per_sqft.df, aes(x = Area, y = Rent_per_sqFtarea)) +
  geom_point()

p +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    x = "Area",
    y = "Rent per sq ft",
    title = "Scatterplot of Area and Rent per sq ft"
  )
## Warning: Removed 53 rows containing missing values (geom_point).

# Rent per sq ft against MacroArea (categorical x-axis)
p <- ggplot(rent_per_sqft.df, aes(x = MacroArea, y = Rent_per_sqFtarea)) +
  geom_point()
p +
  labs(
    x = "Macro Area",
    y = "Rent per sq ft",
    title = "Scatterplot of Macro Area and Rent per sq ft"
  )
## Warning: Removed 53 rows containing missing values (geom_point).

Part 16: Scatter Plot of No of photos and Rent

# No..of.Photos is read as a factor, and as.numeric() on a factor returns the
# integer level codes (levels sort as text: "0", "1", "10", "11", ...), not
# the photo counts. Go through as.character() to recover the real numbers;
# entries that are not numeric text become NA (with a coercion warning).
No..of.Photos <- as.numeric(as.character(houseprices.df$No..of.Photos))
plot(
  No..of.Photos,
  houseprices.df$Rent_Final,
  main = "Plot of No of Photos and Rent",
  xlab = "No of Photos",
  ylab = "Rent"
)

Part 17, 18, 19: Scatter plots of Total Floors, Floor No and (Floor No / Total Floors) against Rent

# Rent against the total number of floors in the building. The x-axis label
# now names the plotted variable (it previously read "Floor No").
plot(
  houseprices.df$Total.Floors,
  houseprices.df$Rent_Final,
  main = "Plot of Total Floors and Rent",
  xlab = "Total Floors",
  ylab = "Rent"
)

# Rent against the floor the flat is on
Floor.no <- as.numeric(houseprices.df$Floor.no)
plot(
  Floor.no,
  houseprices.df$Rent_Final,
  main = "Plot of Floor No and Rent",
  xlab = "Floor No",
  ylab = "Rent"
)

# Relative height of the flat within its building (floor / total floors);
# NA where either value is missing.
Floor.Location <- Floor.no / houseprices.df$Total.Floors
plot(
  Floor.Location,
  houseprices.df$Rent_Final,
  main = "Plot of (Floor No/Total Floors) and Rent",
  xlab = "(Floor No / Total Floors)",
  ylab = "Rent"
)

Part 20 and 21: Scatter Plot of Super Area, Carpet Area with Rent

# Carpert.Area is read as a factor, so as.numeric() on it would return level
# codes rather than areas. Convert through as.character() first, before any
# plot uses it; non-numeric text becomes NA (with a coercion warning).
Carpert.Area <- as.numeric(as.character(houseprices.df$Carpert.Area))

# Carpet area against super area
plot(
  houseprices.df$Super.Area,
  Carpert.Area,
  xlab = "Super.Area",
  ylab = "Carpert.Area"
)

# Rent against super area
plot(
  houseprices.df$Super.Area,
  houseprices.df$Rent_Final,
  main = "Plot of Super Area and Rent",
  xlab = "Super Area",
  ylab = "Rent"
)

# Rent against carpet area
plot(
  Carpert.Area,
  houseprices.df$Rent_Final,
  main = "Plot of Carpet Area and Rent",
  xlab = "Carpet Area",
  ylab = "Rent"
)

Part 22: Mean rent by number of rooms and by number of bathrooms

# Mean rent for each distinct number of rooms
with(
  houseprices.df,
  aggregate(Rent_Final, by = list(No..of.Rooms), mean)
)
##   Group.1        x
## 1       1 25960.70
## 2       2 52762.75
# Mean rent for each distinct number of bathrooms
with(
  houseprices.df,
  aggregate(Rent_Final, by = list(No..of.Bathrooms), mean)
)
##   Group.1        x
## 1       1 27314.48
## 2       2 47042.26
## 3       3 68166.67
## 4       5 75000.00
#library(dplyr)
#group <- group_by(houseprices.df,`No. of Rooms`,`No. of Bathrooms`)
#summarise(group, count = n(),
#mean = mean(Rent_Final, na.rm = TRUE),
#sd   = sd(Rent_Final, na.rm = TRUE)) 

Part 23: Boxplot of rent by facing direction

# Rent distribution per facing direction; axis labels are drawn
# perpendicular to the axis (las = 2) and the fill colours are recycled.
boxplot(
  Rent_Final ~ Facing,
  data = houseprices.df,
  main = "Boxplot for rent grouped by Facing direction",
  col = c("Blue", "red", "green", "yellow"),
  las = 2
)

Part 24: Correlation Matrix for all the Continuous Variables

library(psych)
# Columns treated as continuous; selected by name (these are positions
# 6, 7, 11, 13, 18, 19, 22 and 26 of the data frame).
numeric_cols <- c(
  "No..of.Rooms", "No..of.Bathrooms", "No..of.Photos", "Floor.no",
  "Super.Area", "Carpert.Area", "Total.Floors", "Rent_Final"
)
# No..of.Photos and Carpert.Area are factors; as.numeric() alone would store
# their level codes. Converting through as.character() keeps the real values
# (non-numeric text becomes NA with a coercion warning). lapply() guarantees
# one converted vector per column regardless of input shape.
houseprices.df[numeric_cols] <- lapply(
  houseprices.df[numeric_cols],
  function(column) as.numeric(as.character(column))
)
# NOTE(review): the printed output recorded after this chunk was produced
# with level codes for the two factor columns; re-run to refresh it.
corr.test(houseprices.df[, numeric_cols], use = "complete")
## Call:corr.test(x = houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)], 
##     use = "complete")
## Correlation matrix 
##                  No..of.Rooms No..of.Bathrooms No..of.Photos Floor.no
## No..of.Rooms             1.00             0.56         -0.03     0.26
## No..of.Bathrooms         0.56             1.00         -0.04     0.20
## No..of.Photos           -0.03            -0.04          1.00     0.01
## Floor.no                 0.26             0.20          0.01     1.00
## Super.Area               0.77             0.52         -0.04     0.28
## Carpert.Area             0.73             0.44         -0.05     0.27
## Total.Floors             0.32             0.22          0.00     0.73
## Rent_Final               0.46             0.30          0.04     0.07
##                  Super.Area Carpert.Area Total.Floors Rent_Final
## No..of.Rooms           0.77         0.73         0.32       0.46
## No..of.Bathrooms       0.52         0.44         0.22       0.30
## No..of.Photos         -0.04        -0.05         0.00       0.04
## Floor.no               0.28         0.27         0.73       0.07
## Super.Area             1.00         0.66         0.37       0.58
## Carpert.Area           0.66         1.00         0.32       0.42
## Total.Floors           0.37         0.32         1.00       0.08
## Rent_Final             0.58         0.42         0.08       1.00
## Sample Size 
## [1] 1530
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                  No..of.Rooms No..of.Bathrooms No..of.Photos Floor.no
## No..of.Rooms             0.00              0.0          0.60        0
## No..of.Bathrooms         0.00              0.0          0.60        0
## No..of.Photos            0.19              0.1          0.00        1
## Floor.no                 0.00              0.0          0.68        0
## Super.Area               0.00              0.0          0.11        0
## Carpert.Area             0.00              0.0          0.07        0
## Total.Floors             0.00              0.0          0.97        0
## Rent_Final               0.00              0.0          0.14        0
##                  Super.Area Carpert.Area Total.Floors Rent_Final
## No..of.Rooms            0.0         0.00            0       0.00
## No..of.Bathrooms        0.0         0.00            0       0.00
## No..of.Photos           0.6         0.47            1       0.60
## Floor.no                0.0         0.00            0       0.03
## Super.Area              0.0         0.00            0       0.00
## Carpert.Area            0.0         0.00            0       0.00
## Total.Floors            0.0         0.00            0       0.03
## Rent_Final              0.0         0.00            0       0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Part 25: Plotting Correlation Matrix

#install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
# Scatterplot matrix with histograms on the diagonal and correlations in the
# upper panels, for the eight columns used in the correlation analysis.
corr_vars <- houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)]
chart.Correlation(corr_vars, histogram = TRUE, pch = 19)

library(corrgram)
# Correlogram of the same eight columns: shaded cells below the diagonal,
# correlation with confidence interval above, variable names on the diagonal.
corrgram(
  houseprices.df[, c(6, 7, 11, 13, 18, 19, 22, 26)],
  main = "Corrgram",
  lower.panel = panel.shade,
  upper.panel = panel.conf,
  text.panel = panel.txt
)