library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(infer)
# Show the working directory; the relative data path below is resolved
# against it.
getwd()
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Final_Project/Melbourne_Housing_Data/Data_and_Documentation"
# Read the housing data set into a tibble (column types are guessed by readr).
mhd <- readr::read_csv(file = "melb_data.csv")
## Rows: 13580 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Suburb, Address, Type, Method, SellerG, Date, CouncilArea, Regionname
## dbl (13): Rooms, Price, Distance, Postcode, Bedroom2, Bathroom, Car, Landsiz...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of rows and columns that were loaded.
dim(x = mhd)
## [1] 13580 21
# Count the missing (NA) values in every column of the raw data.
# summarise(across(...)) replaces summarise_all(funs(...)): funs() has been
# deprecated since dplyr 0.8.0 and raised a deprecation warning here. The
# former select(everything()) step was a no-op and has been dropped.
mhd %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postc…¹ Bedro…²
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0 0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## # BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## # Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## # variable names ¹Postcode, ²Bedroom2
# Rows with a strictly positive building area. filter() also discards rows
# whose BuildingArea is NA, because the comparison evaluates to NA for them.
mhd_bldarea <- mhd %>%
  dplyr::filter(BuildingArea > 0)
# Rows that report a negative building area, kept in their own table.
mhd_x <- mhd %>%
  dplyr::filter(BuildingArea < 0)
# Re-count the NAs per column after restricting to rows with a positive
# BuildingArea. Uses across() instead of the deprecated summarise_all(funs()),
# and drops the redundant select(everything()).
mhd_bldarea %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postc…¹ Bedro…²
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0 0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## # BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## # Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## # variable names ¹Postcode, ²Bedroom2
# Further restrict to rows with a positive YearBuilt; rows where YearBuilt is
# NA are dropped too, since the comparison evaluates to NA for them.
mhd_bld_year <- filter(mhd_bldarea, YearBuilt > 0)
# Re-count the NAs per column on the reduced data. Uses across() instead of
# the deprecated summarise_all(funs()), and drops the redundant
# select(everything()).
mhd_bld_year %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
## Suburb Address Rooms Type Price Method SellerG Date Distance Postc…¹ Bedro…²
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0 0
## # … with 10 more variables: Bathroom <int>, Car <int>, Landsize <int>,
## # BuildingArea <int>, YearBuilt <int>, CouncilArea <int>, Lattitude <int>,
## # Longtitude <int>, Regionname <int>, Propertycount <int>, and abbreviated
## # variable names ¹Postcode, ²Bedroom2
# Derive two columns in one step:
#   totarea        - BuildingArea plus Landsize
#   price_per_area - Price divided by that combined area
mhd_bld_year <- mhd_bld_year %>%
  mutate(
    totarea = BuildingArea + Landsize,
    price_per_area = Price / totarea
  )
# Kolmogorov-Smirnov check of whether sale prices are normally distributed.
# Null hypothesis: the data follow the reference normal distribution, so a
# large deviation from normality yields a low p-value. The test is run for
# each housing type; this is the run for type "h".
#
# ks.test(x, pnorm) with no further arguments compares against the *standard*
# normal N(0, 1). Raw prices lie far outside that range, so D is 1 regardless
# of the shape of the data. The sample mean and standard deviation are
# therefore passed on to pnorm so that the shape is what gets tested.
# Caveat: the parameters are estimated from the same data, so the reported
# p-value is only approximate (conservative); read it together with the
# histogram and the Q-Q plot.
mhd_h <- filter(mhd, Type == "h")
ks.test(
  mhd_h$Price,
  "pnorm",
  mean = mean(mhd_h$Price, na.rm = TRUE),
  sd = sd(mhd_h$Price, na.rm = TRUE)
)
## (Recorded output removed: the earlier result, D = 1, came from comparing
## raw prices against N(0, 1). Re-knit the document to regenerate it.)
# Histogram of the same prices as a visual check of the distribution's shape.
hist(mhd_h$Price)

library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Quantile-quantile plot of the type "h" prices, drawn with car's qqPlot()
# (compares against the normal distribution unless told otherwise).
# NOTE(review): the two integers printed afterwards are presumably the row
# indices of the points qqPlot() labels as most extreme - confirm in car docs.
car::qqPlot(x = mhd_h$Price)

## [1] 8123 5048