library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at C:/Users/hasnain haider/Desktop/IntrotoR/Week4/Homework3pt2/r_docs
# Read the Boston housing CSV (path is relative to this document's folder)
# into a tibble; readr guesses the column types and reports them.
housing <- "../data/BostonHousing.csv" %>%
  readr::read_csv()
## Parsed with column specification:
## cols(
##   CRIM = col_double(),
##   ZN = col_double(),
##   INDUS = col_double(),
##   CHAS = col_double(),
##   NOX = col_double(),
##   RM = col_double(),
##   AGE = col_double(),
##   DIS = col_double(),
##   RAD = col_double(),
##   TAX = col_double(),
##   PTRATIO = col_double(),
##   POVRATE = col_double(),
##   MEDV = col_double(),
##   `CAT. MEDV` = col_double()
## )

Q2 Part B

# Show the number of rows and columns of `housing` as a styled table.
# Each count is labelled so the table does not fall back to a bare "x" header,
# and `format = "html"` is set explicitly because kableExtra::kable_styling()
# warns when kable() is left to pick the output format on its own.
tibble(
  Dimension = c("Rows", "Columns"),
  Count = dim(housing)
) %>%
  knitr::kable(format = "html", caption = "Dimension") %>%
  kableExtra::kable_styling(bootstrap_options = "striped")
## Warning in kableExtra::kable_styling(., bootstrap_options = "striped"): Please
## specify format in kable. kableExtra can customize either HTML or LaTeX outputs.
## See https://haozhu233.github.io/kableExtra/ for details.
Dimension
x
506
14

Q2 Part C

# Report the low-level storage type of the data set.
housing %>%
  typeof()
## [1] "list"
# Print a compact summary of every column: its type and first few values.
utils::str(housing)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 506 obs. of  14 variables:
##  $ CRIM     : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ ZN       : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ INDUS    : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ CHAS     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOX      : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ RM       : num  6.58 6.42 7.18 7 7.15 ...
##  $ AGE      : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ DIS      : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ RAD      : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ TAX      : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ PTRATIO  : num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ POVRATE  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ MEDV     : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ CAT. MEDV: num  0 0 1 1 1 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   CRIM = col_double(),
##   ..   ZN = col_double(),
##   ..   INDUS = col_double(),
##   ..   CHAS = col_double(),
##   ..   NOX = col_double(),
##   ..   RM = col_double(),
##   ..   AGE = col_double(),
##   ..   DIS = col_double(),
##   ..   RAD = col_double(),
##   ..   TAX = col_double(),
##   ..   PTRATIO = col_double(),
##   ..   POVRATE = col_double(),
##   ..   MEDV = col_double(),
##   ..   `CAT. MEDV` = col_double()
##   .. )

Q2 Part D

# Scatter plot of PTRATIO (y) against MEDV (x).
# The y-axis was previously labelled "Poverty ratio", but the plotted column is
# PTRATIO; the data set has a separate POVRATE column for poverty.
# NOTE(review): PTRATIO is the pupil-teacher ratio in the standard Boston
# housing data -- confirm against this file's data dictionary. If poverty was
# the intended variable, plot POVRATE instead.
ggplot(housing, aes(x = MEDV, y = PTRATIO)) +
  geom_point(color = "maroon") +
  labs(x = "Median", y = "Pupil-teacher ratio", title = "Scatter Plot")

Bar Chart

# Bar chart of MEDV (y) by CHAS (x).
# The axis labels were previously swapped: CHAS is on the x-axis and MEDV on
# the y-axis, so "Charles River" belongs on x and "Median" on y.
# geom_col() stacks one segment per row, so each bar's height is the sum of
# MEDV within that CHAS value.
ggplot(housing, aes(x = CHAS, y = MEDV)) +
  geom_col(color = "Green") +
  labs(x = "Charles River", y = "Median", title = "Bar Chart")

# Bar chart of MEDV (y) by CHAS (x), restricted to rows with MEDV above 30.
# The axis labels were previously swapped: CHAS is on the x-axis and MEDV on
# the y-axis, so "Charles River" belongs on x and "Median" on y.
housing %>%
  filter(MEDV > 30) %>%
  ggplot(aes(x = CHAS, y = MEDV)) +
  geom_col(color = "orange") +
  labs(x = "Charles River", y = "Median", title = "Bar Chart")

Histogram

# Histogram of MEDV.
# `bins = 30` is stated explicitly: it is the value stat_bin() was defaulting
# to, so the plot is unchanged, but the "Pick better value with `binwidth`"
# message is no longer emitted.
ggplot(housing, aes(x = MEDV)) +
  geom_histogram(bins = 30, color = "Pink") +
  labs(x = "Median", title = "Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Box Plot

# Box plot of MEDV for each value of CHAS.
# CHAS is converted to a factor so the x-axis is discrete (one tick per CHAS
# value) rather than a continuous numeric scale; this also makes the manual
# `group = CHAS` unnecessary. The x-axis label's spelling of "boundary" is
# corrected.
# NOTE(review): assumes CHAS is a 0/1 indicator -- confirm against the data.
ggplot(housing, aes(x = factor(CHAS), y = MEDV)) +
  geom_boxplot(color = "light blue") +
  labs(x = "No boundary & boundary", y = "Median", title = "Box Plot")

Scatter Plot

# Scatter plot of CRIM (y) against MEDV (x), drawn with blue points.
ggplot(housing, aes(x = MEDV, y = CRIM)) +
  geom_point(color = "Blue") +
  labs(x = "Median", y = "CRIM", title = "Scatter Plot")

Adding New Variable

# Append LCRIM, the natural logarithm of CRIM, as a new column and print the
# updated tibble.
housing <- housing %>%
  mutate(LCRIM = log(CRIM))
housing
## # A tibble: 506 x 15
##       CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO POVRATE
##      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
##  1 0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296    15.3    4.98
##  2 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242    17.8    9.14
##  3 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242    17.8    4.03
##  4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222    18.7    2.94
##  5 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222    18.7    5.33
##  6 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222    18.7    5.21
##  7 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311    15.2   12.4 
##  8 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311    15.2   19.2 
##  9 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5   311    15.2   29.9 
## 10 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311    15.2   17.1 
## # ... with 496 more rows, and 3 more variables: MEDV <dbl>, `CAT. MEDV` <dbl>,
## #   LCRIM <dbl>

Scatter Plot of LCRIM

# Scatter plot of LCRIM (log of CRIM, y) against MEDV (x), drawn with purple
# points.
ggplot(housing, aes(x = MEDV, y = LCRIM)) +
  geom_point(color = "Purple") +
  labs(x = "Median", y = "Lcrim", title = "Scatter Plot")

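Compared with the previous scatter plot of MEDV against CRIM, the log transformation spreads the crime values out. In the previous plot the points were bunched together rather than scattered, whereas in the plot of MEDV against LCRIM they are distributed more evenly across the vertical axis.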