1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.

# Load the housing data set from CSV.
# stringsAsFactors = TRUE keeps the yes/no text columns as factors on
# R >= 4.0 (where the default became FALSE), so summary() reports the
# no/yes counts for those columns instead of just "character".
housing <- read.csv(
  file = "/Users/Raghu/cuny/week3/Housing.csv",
  stringsAsFactors = TRUE
)
class(housing)
## [1] "data.frame"
# Print per-column summary statistics for every column of the data set.
summary(housing)
##        No            price           lotsize         bedrooms    
##  Min.   :  1.0   Min.   : 25000   Min.   : 1650   Min.   :1.000  
##  1st Qu.:137.2   1st Qu.: 49125   1st Qu.: 3600   1st Qu.:2.000  
##  Median :273.5   Median : 62000   Median : 4600   Median :3.000  
##  Mean   :273.5   Mean   : 68122   Mean   : 5150   Mean   :2.965  
##  3rd Qu.:409.8   3rd Qu.: 82000   3rd Qu.: 6360   3rd Qu.:3.000  
##  Max.   :546.0   Max.   :190000   Max.   :16200   Max.   :6.000  
##     bathrms         stories      driveway  recroom   fullbase  gashw    
##  Min.   :1.000   Min.   :1.000   no : 77   no :449   no :355   no :521  
##  1st Qu.:1.000   1st Qu.:1.000   yes:469   yes: 97   yes:191   yes: 25  
##  Median :1.000   Median :2.000                                          
##  Mean   :1.286   Mean   :1.808                                          
##  3rd Qu.:2.000   3rd Qu.:2.000                                          
##  Max.   :4.000   Max.   :4.000                                          
##  airco        garagepl      prefarea 
##  no :373   Min.   :0.0000   no :418  
##  yes:173   1st Qu.:0.0000   yes:128  
##            Median :0.0000            
##            Mean   :0.6923            
##            3rd Qu.:1.0000            
##            Max.   :3.0000
# Count the houses with and without a driveway.
table(housing[["driveway"]])
## 
##  no yes 
##  77 469
# Frequency of each distinct garagepl value.
table(housing[["garagepl"]])
## 
##   0   1   2   3 
## 300 126 108  12
# Three-way contingency table: bedrooms (rows) by bathrooms (columns),
# with one slice per driveway level.
table(
  housing[["bedrooms"]],
  housing[["bathrms"]],
  housing[["driveway"]]
)
## , ,  = no
## 
##    
##       1   2   3   4
##   1   1   0   0   0
##   2  18   2   0   0
##   3  34   4   1   0
##   4   6   6   1   0
##   5   2   2   0   0
##   6   0   0   0   0
## 
## , ,  = yes
## 
##    
##       1   2   3   4
##   1   1   0   0   0
##   2 110   6   0   0
##   3 191  68   3   0
##   4  36  42   3   1
##   5   2   2   2   0
##   6   1   1   0   0

2. Data wrangling: Please perform some basic transformations. They will need to make sense but could include column renaming, creating a subset of the data, replacing values, or creating new columns with derived data (for example – if it makes sense you could sum two columns together)

# Price, bedroom and bathroom columns only (column names are auto-generated).
hbedbath <- data.frame(housing$price, housing$bedrooms, housing$bathrms)

# Houses priced at 100,000 or more, with short readable column names.
hbedbath1 <- setNames(
  hbedbath[housing$price >= 100000, ],
  c("Price", "Bed", "Bath")
)

# Price and bedroom count for every house, named at construction time.
hbedbath2 <- data.frame(Price = housing$price, Bed = housing$bedrooms)

# Summary statistics for the Price and Bed columns.
summary(hbedbath2)
##      Price             Bed       
##  Min.   : 25000   Min.   :1.000  
##  1st Qu.: 49125   1st Qu.:2.000  
##  Median : 62000   Median :3.000  
##  Mean   : 68122   Mean   :2.965  
##  3rd Qu.: 82000   3rd Qu.:3.000  
##  Max.   :190000   Max.   :6.000
# What are the mean and median prices of houses in the city of Windsor?

mean(hbedbath2[["Price"]])
## [1] 68121.6
median(hbedbath2[["Price"]])
## [1] 62000
# List the houses priced above 100,000 that have 4 bedrooms and 2 stories.
# Column names are resolved inside `housing` by subset(), so no `housing$`
# prefix is needed in the condition.

x.sub <- subset(housing, price > 100000 & bedrooms == 4 & stories == 2)
x.sub
##      No  price lotsize bedrooms bathrms stories driveway recroom fullbase
## 93   93 163000    7420        4       1       2      yes     yes      yes
## 104 104 132000    3500        4       2       2      yes      no       no
## 162 162 130000    6000        4       1       2      yes      no      yes
## 217 217 138300    6000        4       3       2      yes     yes      yes
## 339 339 141000    8100        4       1       2      yes     yes      yes
## 361 361 130000    6600        4       2       2      yes     yes      yes
## 374 374 122000    6540        4       2       2      yes     yes      yes
## 376 376 133000    6550        4       2       2      yes      no       no
## 419 419 174500    7500        4       2       2      yes      no      yes
## 446 446 104900   11440        4       1       2      yes      no      yes
## 448 448 120000    5500        4       2       2      yes      no      yes
## 486 486 118500    4880        4       2       2      yes      no       no
## 519 519 101000    6240        4       2       2      yes      no       no
##     gashw airco garagepl prefarea
## 93     no   yes        2       no
## 104   yes    no        2       no
## 162    no    no        2       no
## 217   yes    no        2       no
## 339    no   yes        2      yes
## 361    no   yes        1      yes
## 374    no   yes        2      yes
## 376    no   yes        1      yes
## 419    no   yes        3      yes
## 446    no    no        1      yes
## 448    no   yes        1      yes
## 486    no   yes        1      yes
## 519    no   yes        1       no

3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.

# What is the relationship between house price and the number of bedrooms
# in the city of Windsor?

with(
  hbedbath2,
  plot(
    Price, Bed,
    xlab = "House Price",
    ylab = "Number of Bed Room",
    main = "House Price Vs No. of Bed Rooms",
    col = "red"
  )
)

# Relationship between house price and number of bedrooms for the houses in
# hbedbath1, with the 3-bedroom houses shown in a different colour.

plot(
  hbedbath1$Price, hbedbath1$Bed,
  xlab = "House Price",
  ylab = "Number of Bed Room",
  main = "House Price Vs No. of Bed Rooms",
  col = "red"
)

# Overlay the 3-bedroom houses in blue. The y values must come from Bed (the
# plotted y variable), not Bath, so the highlighted squares land on the
# existing points. Bed is numeric, so compare against the number 3.
points(
  hbedbath1$Price[hbedbath1$Bed == 3],
  hbedbath1$Bed[hbedbath1$Bed == 3],
  pch = 15,
  col = "blue"
)

# Histogram of house prices: price range (x) against number of houses (y).

histp <- hist(
  housing$price,
  freq = TRUE,
  xlab = "Housing Price Range",
  ylab = "No. of Houses",
  main = "Housing Price Vs Number of Houses",
  col = "lightgreen"
)

# Overlay a normal curve with the sample mean and standard deviation.
# The histogram is drawn with counts, so the density returned by dnorm() is
# multiplied by (number of observations * bin width) to put it on the same
# scale; unscaled, the curve would sit flat along the x axis.
curve(
  dnorm(x, mean = mean(housing$price), sd = sd(housing$price)) *
    length(housing$price) * diff(histp$breaks)[1],
  add = TRUE,
  col = "darkblue",
  lwd = 2
)

# Print the histogram object (breaks, counts, densities, midpoints).
histp
## $breaks
##  [1]  20000  40000  60000  80000 100000 120000 140000 160000 180000 200000
## 
## $counts
## [1]  53 205 145  79  37  18   4   4   1
## 
## $density
## [1] 4.853480e-06 1.877289e-05 1.327839e-05 7.234432e-06 3.388278e-06
## [6] 1.648352e-06 3.663004e-07 3.663004e-07 9.157509e-08
## 
## $mids
## [1]  30000  50000  70000  90000 110000 130000 150000 170000 190000
## 
## $xname
## [1] "housing$price"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
# Box plot of house price grouped by number of bedrooms.

boxplot(
  price ~ bedrooms,
  data = housing,
  main = toupper("Housing Price Vs Bedrooms"),
  font.main = 3,
  cex.main = 1.2,
  xlab = "Bed Rooms",
  ylab = "Housing Price",
  font.lab = 3,
  col = "darkblue"
)

4. Data Visualization

library(ggplot2)

# Scatter plots of price against bedrooms, bathrooms and stories.
# aes() refers to columns by bare name; the data frame is supplied once via
# the `data` argument. Using `housing$col` inside aes() bypasses that data
# argument and yields axis labels such as "housing$price".
ggplot(housing, aes(x = price, y = bedrooms)) +
  geom_point()

ggplot(housing, aes(x = price, y = bathrms)) +
  geom_point()

ggplot(housing, aes(x = price, y = stories)) +
  geom_point()

library(lattice)
library(plot3D)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive 3D scatter plot: price (x), bedrooms (y), bathrooms (z),
# with markers coloured by price.
# - Columns are referenced with one-sided formulas (~col) evaluated in `housing`.
# - The marker colour is mapped to ~price; the bare name `mpg` is not a
#   column of `housing` (it resolves to the ggplot2 example data set).
# - The annotation uses paper coordinates (xref/yref = 'paper'), which are
#   fractions of the plot area, so it is placed just outside the top-right
#   corner to label the colour bar.
p <- plot_ly(housing, x = ~price, y = ~bedrooms, z = ~bathrms,
        marker = list(color = ~price, colorscale = c('blue', 'red'), showscale = TRUE)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Housing Price'),
                     yaxis = list(title = 'No. of Bed'),
                     zaxis = list(title = 'No of Bath')),
         annotations = list(
           x = 1.13,
           y = 1.05,
           text = 'Housing Price',
           xref = 'paper',
           yref = 'paper',
           showarrow = FALSE
         ))

p

Conclusion: Based on the above graphs, a person can visualize the distribution of houses in the city of Windsor across price ranges, along with the number of bedrooms and bathrooms found in each range. This helps a customer with a specific budget identify the houses available in that range and make an effective decision.