library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(infer)
# Print the current working directory; the relative CSV path below is resolved
# against it.
getwd()
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Final_Project/Melbourne_Housing_Data/Data_and_Documentation"
# Read melb_data.csv into a tibble named `mhd`; readr guesses the column types
# and reports them in the message that follows.
mhd <- read_csv("melb_data.csv")
## Rows: 13580 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Suburb, Address, Type, Method, SellerG, Date, CouncilArea, Regionname
## dbl (13): Rooms, Price, Distance, Postcode, Bedroom2, Bathroom, Car, Landsiz...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of rows and columns in the raw data.
dim(mhd)
## [1] 13580    21
### Count the missing values (NA) in every column of the raw data.
# `summarise(across(...))` replaces the deprecated `summarise_all(funs(...))`
# form; the former `select(everything())` step selected every column and was
# therefore a no-op, so it is dropped. The result is still a one-row tibble of
# integer counts with the original column names.
# NOTE: any `funs()` deprecation warning recorded below this call came from the
# old code and will not reappear when the script is re-run.
mhd %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 1 × 21
##   Suburb Address Rooms  Type Price Method SellerG  Date Distance Postc…¹ Bedro…²
##    <int>   <int> <int> <int> <int>  <int>   <int> <int>    <int>   <int>   <int>
## 1      0       0     0     0     0      0       0     0        0       0       0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## #   BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## #   Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## #   variable names ¹​Postcode, ²​Bedroom2
# Keep only rows with a strictly positive BuildingArea. `filter()` also drops
# rows where the condition evaluates to NA, so rows with a missing BuildingArea
# are removed here as well.
mhd_bldarea <- filter(mhd, BuildingArea > 0)
# Rows with a negative BuildingArea; not referenced again in the visible part
# of this script.
mhd_x <- filter(mhd, BuildingArea < 0)
# Re-count the NAs per column after the BuildingArea filter.
# `summarise(across(...))` replaces the deprecated `summarise_all(funs(...))`,
# and the no-op `select(everything())` step is dropped; the output is unchanged.
mhd_bldarea %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
##   Suburb Address Rooms  Type Price Method SellerG  Date Distance Postc…¹ Bedro…²
##    <int>   <int> <int> <int> <int>  <int>   <int> <int>    <int>   <int>   <int>
## 1      0       0     0     0     0      0       0     0        0       0       0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## #   BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## #   Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## #   variable names ¹​Postcode, ²​Bedroom2
# From the BuildingArea-filtered data, keep only rows with a positive
# YearBuilt. As with any `filter()` condition, rows where YearBuilt is NA are
# dropped too.
mhd_bld_year <- filter(mhd_bldarea, YearBuilt > 0)
# Re-count the NAs per column after the YearBuilt filter.
# `summarise(across(...))` replaces the deprecated `summarise_all(funs(...))`,
# and the no-op `select(everything())` step is dropped; the output is unchanged.
mhd_bld_year %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
##   Suburb Address Rooms  Type Price Method SellerG  Date Distance Postc…¹ Bedro…²
##    <int>   <int> <int> <int> <int>  <int>   <int> <int>    <int>   <int>   <int>
## 1      0       0     0     0     0      0       0     0        0       0       0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## #   BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## #   Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## #   variable names ¹​Postcode, ²​Bedroom2
# Add two derived columns to the filtered data:
#   totarea        - BuildingArea plus Landsize
#   price_per_area - Price divided by that combined area
# `mutate()` evaluates its arguments in order, so `price_per_area` can use the
# `totarea` column created just before it.
mhd_bld_year <- mhd_bld_year %>%
  mutate(
    totarea = BuildingArea + Landsize,
    price_per_area = Price / totarea
  )
### One-sample Kolmogorov-Smirnov test of normality for the prices of
### Type == "h" properties. The null hypothesis is that the data follow the
### reference distribution, so a large deviation from it gives a low p-value.
### The same test is run for each housing type.
# The reference normal must be given the sample's own mean and sd. A bare
# `pnorm` means the standard normal N(0, 1); every price of this magnitude lies
# in its extreme right tail, which forces D = 1 whatever the shape of the data.
# Caveat: the parameters are estimated from the same sample, so the reported
# p-value is only approximate (see the Details section of ?ks.test).
# NOTE: the test output recorded below predates this fix; re-run to refresh it.
mhd_h <- filter(mhd, Type == "h")
ks.test(
  mhd_h$Price, "pnorm",
  mean = mean(mhd_h$Price),
  sd = sd(mhd_h$Price)
)
## Warning in ks.test.default(mhd_h$Price, pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  mhd_h$Price
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Histogram of Price for the Type == "h" subset, as a visual check of the
# distribution's shape.
hist(mhd_h$Price)

library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Quantile-comparison plot (car::qqPlot) of Price for the Type == "h" subset.
# NOTE(review): the two integers printed after the plot are presumably the
# indices of the most extreme observations that car labels by default, and the
# reference distribution is presumably the normal - confirm against ?qqPlot.
qqPlot(mhd_h$Price)

## [1] 8123 5048