# Set the global knitr chunk option echo = TRUE so each chunk's source code is
# shown in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

R Markdown

Libraries

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3

Reading dataset from any custom path

# Prompt for the CSV file interactively and load it into housing.raw.
# file.choose() replaces choose.files(), which exists only on Windows builds
# of R, so the script now also runs on macOS and Linux.
# NOTE(review): an interactive picker cannot run in an unattended knit; use a
# fixed relative path if the report must render non-interactively.
housing.raw <- read.csv(file.choose(), header = TRUE, sep = ",")

Data heads

# Preview the first six rows of the loaded data.
head(housing.raw, n = 6L)
##      RM LSTAT PTRATIO   MEDV
## 1 6.575  4.98    15.3 504000
## 2 6.421  9.14    17.8 453600
## 3 7.185  4.03    17.8 728700
## 4 6.998  2.94    18.7 701400
## 5 7.147  5.33    18.7 760200
## 6 6.430  5.21    18.7 602700

Data tails

# Preview the last six rows of the loaded data.
tail(housing.raw, n = 6L)
##        RM LSTAT PTRATIO   MEDV
## 484 6.027 14.33    19.2 352800
## 485 6.593  9.67    21.0 470400
## 486 6.120  9.08    21.0 432600
## 487 6.976  5.64    21.0 501900
## 488 6.794  6.48    21.0 462000
## 489 6.030  7.88    21.0 249900

Size of dataset

# Dimensions of the data frame: number of rows, then number of columns.
dim(housing.raw)
## [1] 489   4

Number of rows in the dataset

# Number of rows in housing.raw.
nrow(housing.raw)
## [1] 489

Number of columns in the dataset

# Number of columns in housing.raw.
ncol(housing.raw)
## [1] 4

Structure of dataset

# Compact overview: each column's name, storage type and first few values.
str(housing.raw)
## 'data.frame':    489 obs. of  4 variables:
##  $ RM     : num  6.58 6.42 7.18 7 7.15 ...
##  $ LSTAT  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ PTRATIO: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ MEDV   : num  504000 453600 728700 701400 760200 ...

Names of variables in dataset

# List the column names of the data frame.
colnames(housing.raw)
## [1] "RM"      "LSTAT"   "PTRATIO" "MEDV"

Variable datatype in the dataset

# Storage type of every column, as a named character vector.
# vapply() replaces sapply() because it fixes the return type: exactly one
# string per column, and still a character vector for a zero-column input.
vapply(housing.raw, typeof, character(1))
##       RM    LSTAT  PTRATIO     MEDV 
## "double" "double" "double" "double"

Summary of dataset

# Per-column summary statistics (min, quartiles, median, mean, max).
summary(housing.raw)
##        RM            LSTAT          PTRATIO           MEDV        
##  Min.   :3.561   Min.   : 1.98   Min.   :12.60   Min.   : 105000  
##  1st Qu.:5.880   1st Qu.: 7.37   1st Qu.:17.40   1st Qu.: 350700  
##  Median :6.185   Median :11.69   Median :19.10   Median : 438900  
##  Mean   :6.240   Mean   :12.94   Mean   :18.52   Mean   : 454343  
##  3rd Qu.:6.575   3rd Qu.:17.12   3rd Qu.:20.20   3rd Qu.: 518700  
##  Max.   :8.398   Max.   :37.97   Max.   :22.00   Max.   :1024800

Correlation among variables

# Pairwise correlation matrix of all columns; the default method (Pearson)
# is spelled out for clarity.
cor(housing.raw, method = "pearson")
##                 RM      LSTAT    PTRATIO       MEDV
## RM       1.0000000 -0.6120332 -0.3045593  0.6972092
## LSTAT   -0.6120332  1.0000000  0.3604446 -0.7606701
## PTRATIO -0.3045593  0.3604446  1.0000000 -0.5190335
## MEDV     0.6972092 -0.7606701 -0.5190335  1.0000000

Plotting Correlation

# Visualise the correlation matrix; "circle" is corrplot's default glyph.
corrplot(corr = cor(housing.raw), method = "circle")

# Data Visualization

# Scatter-plot matrix of every pair among the four variables.
pairs(
  ~ RM + LSTAT + PTRATIO + MEDV,
  data = housing.raw,
  main = "Boston Data"
)

# Scatter Plot

# Scatter plot with RM on the y-axis against LSTAT on the x-axis.
plot(RM ~ LSTAT, data = housing.raw)

# Boxplot of all variables

# Draw one boxplot per variable, side by side in a 1 x 4 grid.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots in the session.
old_par <- par(mfrow = c(1, 4))
for (variable in c("RM", "LSTAT", "PTRATIO", "MEDV")) {
  boxplot(housing.raw[[variable]], main = variable, col = "Sky Blue")
}
par(old_par)

# Bar chart of MEDV

# Count the observations at each distinct MEDV value and draw them as blue bars.
ggplot(data = housing.raw, mapping = aes(x = MEDV)) +
  geom_bar(fill = "blue")

Note that adding the echo = FALSE parameter to a code chunk prevents printing of the R code that generated the plot.