This dataset comes from Kaggle (https://www.kaggle.com/datasets/yuliiabulana/canada-housing). It contains property listings with features such as price, location, size, number of bedrooms and bathrooms, and other additional attributes.
The features used in this data exploration and visualization are described below:
| Feature | Details |
|---|---|
| Price | Canadian currency (50,000 - 5,000,000) |
| Property Type | Single family, Condo, Townhome, Condo/Townhome, Duplex |
| Square Footage | (0 - 10000) |
| Province | AB (Alberta), BC (British Columbia), MB (Manitoba), NB (New Brunswick), NL (Newfoundland and Labrador), NS (Nova Scotia), NT (Northwest Territories), ON (Ontario), PE (Prince Edward Island), SK (Saskatchewan), YT (Yukon). |
library(readxl)
library(ggplot2)
library(ggpubr)
library(scales)
library(ggridges)
# Read the Canada housing listings from the local Excel workbook and
# preview the first rows to confirm the columns were parsed as expected.
canada <- readxl::read_xlsx(
  path = "D:/STATISTIKA 23/KULIAH/MK SEMESTER 4/Kumpulan dataset/Canada Housing.xlsx"
)
head(x = canada)
## # A tibble: 6 × 23
## City Province Latitude Longitude Price Bedrooms Bathrooms Acreage
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Revelstoke BC 51.0 -118. 839000 3 2 0
## 2 Boswell BC 49.5 -117. 1150000 3 2 0.32
## 3 West Kelowna BC 49.8 -120. 149000 2 1 0
## 4 Kelowna BC 49.8 -119. 1298000 5 4 0.69
## 5 Maple Ridge BC 49.2 -123. 759900 3 2 0
## 6 Calgary AB 50.9 -114. 749800 4 3 0.07
## # ℹ 15 more variables: `Property Type` <chr>, `Square Footage` <dbl>,
## # Garage <chr>, Parking <chr>, Basement <chr>, Exterior <chr>,
## # Fireplace <chr>, Heating <chr>, Flooring <chr>, Roof <chr>,
## # Waterfront <chr>, Sewer <chr>, Pool <chr>, Garden <chr>, Balcony <chr>
# Number of listings (rows) in the housing data.
nrow(x = canada)
## [1] 44101
# Rows and columns of the housing data in one call.
dim(x = canada)
## [1] 44101 23
# Per-column summary: quantiles for numeric columns, length/class for text.
summary(object = canada)
## City Province Latitude Longitude
## Length:44101 Length:44101 Min. :42.05 Min. :-135.86
## Class :character Class :character 1st Qu.:48.46 1st Qu.:-122.88
## Mode :character Mode :character Median :49.22 Median :-119.23
## Mean :49.01 Mean :-106.39
## 3rd Qu.:49.99 3rd Qu.:-102.38
## Max. :65.28 Max. : -52.67
## Price Bedrooms Bathrooms Acreage
## Min. : 50000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 399900 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 675000 Median : 3.000 Median : 2.000 Median : 0.050
## Mean : 937983 Mean : 3.179 Mean : 2.467 Mean : 2.457
## 3rd Qu.:1175000 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 0.170
## Max. :5000000 Max. :21.000 Max. :23.000 Max. :8600.000
## Property Type Square Footage Garage Parking
## Length:44101 Min. : 140 Length:44101 Length:44101
## Class :character 1st Qu.: 968 Class :character Class :character
## Mode :character Median : 1400 Mode :character Mode :character
## Mean : 1704
## 3rd Qu.: 2145
## Max. :10000
## Basement Exterior Fireplace Heating
## Length:44101 Length:44101 Length:44101 Length:44101
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Flooring Roof Waterfront Sewer
## Length:44101 Length:44101 Length:44101 Length:44101
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Pool Garden Balcony
## Length:44101 Length:44101 Length:44101
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Histogram of listing prices.
# * `bins = 30` is stated explicitly (it was the implicit default) so that
#   ggplot2 no longer prints the "Pick better value with `binwidth`" message.
# * `oob = scales::oob_keep` stops the x scale from censoring the first and
#   last bars, whose edges extend slightly past the axis limits; without it
#   those two bars are dropped with a "Removed 2 rows" warning.
ggplot(data = canada, mapping = aes(x = Price)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Frekuensi"
  ) +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma,
    oob = scales::oob_keep
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Histogram of listing prices with 90 bins and black bar outlines.
# `oob = scales::oob_keep` keeps the first and last bars, whose edges extend
# slightly past the axis limits; with the default out-of-bounds handling the
# scale censors them and ggplot2 drops both with a "Removed 2 rows" warning.
ggplot(data = canada, mapping = aes(x = Price)) +
  geom_histogram(bins = 90, color = "black") +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Frekuensi"
  ) +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma,
    oob = scales::oob_keep
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Histogram of listing prices with red bars, black outlines and
# comma-formatted counts on the y axis.
# `oob = scales::oob_keep` keeps the first and last bars, whose edges extend
# slightly past the axis limits; with the default out-of-bounds handling the
# scale censors them and ggplot2 drops both with a "Removed 2 rows" warning.
ggplot(data = canada, mapping = aes(x = Price)) +
  geom_histogram(bins = 90, color = "black", fill = "red") +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma,
    oob = scales::oob_keep
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Frekuensi"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Kernel density estimate of listing prices.
# The y axis of geom_density() is a density, not a count, so it is labelled
# "Kepadatan" (density) instead of the previous "Frekuensi" (frequency).
ggplot(data = canada, mapping = aes(x = Price)) +
  geom_density() +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Kepadatan"
  ) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma
  )
# Kernel density estimate of listing prices, drawn as a red filled area.
# The y axis of geom_density() is a density, not a count, so it is labelled
# "Kepadatan" (density) instead of the previous "Frekuensi" (frequency).
ggplot(data = canada, mapping = aes(x = Price)) +
  geom_density(fill = "red", alpha = 0.9) +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Kepadatan"
  ) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma
  )
# Kernel density estimate of listing prices, one curve per property type
# (fill and outline colour both mapped to `Property Type`).
# The y axis of geom_density() is a density, not a count, so it is labelled
# "Kepadatan" (density) instead of the previous "Frekuensi" (frequency).
ggplot(
  data = canada,
  mapping = aes(x = Price, fill = `Property Type`, color = `Property Type`)
) +
  geom_density(alpha = 0.9) +
  labs(
    title = "Distribusi Harga Rumah",
    x = "Harga($CAD)",
    y = "Kepadatan"
  ) +
  scale_x_continuous(
    limits = c(0, 5000000),
    breaks = seq(0, 5000000, by = 700000),
    labels = comma
  ) +
  scale_y_continuous(labels = comma)
# Ridgeline plot: one square-footage density ridge per property type.
ggplot(
  canada,
  aes(x = `Square Footage`, y = `Property Type`, fill = `Property Type`)
) +
  geom_density_ridges() +
  scale_x_continuous(
    breaks = seq(from = 0, to = 10000, by = 1000),
    limits = c(0, 10000)
  ) +
  ggtitle("Distribusi Luas Bangunan") +
  xlab("Square Footage") +
  ylab("Property Type")
## Picking joint bandwidth of 143
# Single horizontal box plot of all listing prices.
ggplot(canada, aes(x = Price)) +
  geom_boxplot() +
  scale_x_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma(),
    limits = c(0, 5000000)
  ) +
  ggtitle("Distribusi Harga Rumah")
# Box plot of all listing prices, flipped so the box stands vertically.
ggplot(canada, aes(x = Price)) +
  geom_boxplot() +
  scale_x_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma(),
    limits = c(0, 5000000)
  ) +
  coord_flip() +
  ggtitle("Distribusi Harga Rumah")
# Box plots of listing price, one row per province.
ggplot(canada, aes(x = Price, y = Province)) +
  geom_boxplot() +
  scale_x_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma(),
    limits = c(0, 5000000)
  ) +
  ggtitle("Distribusi Harga Rumah")
# Box plots of square footage, one row (and fill colour) per province.
# The title now names the plotted variable ("Luas Bangunan", building area);
# it previously read "Distribusi Harga Rumah" (house prices), carried over
# from the price plots.
ggplot(
  data = canada,
  mapping = aes(x = `Square Footage`, y = Province, fill = Province)
) +
  geom_boxplot() +
  scale_x_continuous(
    limits = c(0, 10000),
    breaks = seq(0, 10000, by = 1000)
  ) +
  labs(title = "Distribusi Luas Bangunan")
# Box plots of square footage per province, with the legend hidden because
# the y axis already names each province.
# * The title now names the plotted variable ("Luas Bangunan", building
#   area) instead of the copy-pasted "Distribusi Harga Rumah".
# * Breaks stop at 10000 to agree with the axis limits; the previous
#   sequence ran to 16000, beyond the range that is drawn.
ggplot(
  data = canada,
  mapping = aes(x = `Square Footage`, y = Province, fill = Province)
) +
  geom_boxplot() +
  scale_x_continuous(
    limits = c(0, 10000),
    breaks = seq(0, 10000, by = 2000)
  ) +
  theme(legend.position = "none") +
  labs(title = "Distribusi Luas Bangunan")
##Violin Plot
# Violin plot of listing prices with a narrow box plot drawn on top.
# The constant y = "all" places every listing in a single group.
ggplot(canada, aes(x = Price, y = "all")) +
  geom_violin(alpha = 0.9, fill = "green") +
  geom_boxplot(width = 0.2, fill = "blue") +
  scale_x_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma(),
    limits = c(0, 5000000)
  ) +
  ggtitle("Distribusi Harga Rumah")
# Normal Q-Q plot of listing prices: sample quantiles as blue points plus a
# red reference line. Uses ggplot2's own argument names (colour / size /
# linewidth) in place of the base-graphics aliases col / cex / lwd.
ggplot(canada, aes(sample = Price)) +
  stat_qq(colour = "blue", size = 0.9) +
  stat_qq_line(colour = "red", linewidth = 1) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma()
  ) +
  ggtitle("Distribusi Harga Rumah")
# Scatter plot of listing price against square footage.
ggplot(canada, aes(x = `Square Footage`, y = Price)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 10000, by = 1000)) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma()
  ) +
  ggtitle("Distribusi Harga Rumah")
# Scatter plot of listing price against square footage, with points
# coloured by property type.
ggplot(
  canada,
  aes(x = `Square Footage`, y = Price, colour = `Property Type`)
) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 10000, by = 1000)) +
  scale_y_continuous(
    breaks = seq(from = 0, to = 5000000, by = 700000),
    labels = label_comma()
  ) +
  ggtitle("Distribusi Harga Rumah")
##Line Chart
# Read the Oracle stock price history from the local Excel workbook, add a
# calendar-date copy of the timestamp column as `date`, and preview the
# first rows.
oracle <- readxl::read_excel(
  path = "D:/STATISTIKA 23/KULIAH/MK SEMESTER 4/Kumpulan dataset/Oracle Stock Dataset.xlsx"
)
oracle[["date"]] <- as.Date(oracle[["Date"]])
head(x = oracle)
## # A tibble: 6 × 8
## Date `Adj Close` Close High Low Open Volume date
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
## 1 1996-01-02 00:00:00 2.59 3.20 3.21 3.14 3.17 45603000 1996-01-02
## 2 1996-01-03 00:00:00 2.47 3.06 3.21 3.06 3.15 55518750 1996-01-03
## 3 1996-01-04 00:00:00 2.44 3.02 3.08 2.94 3.08 129803850 1996-01-04
## 4 1996-01-05 00:00:00 2.57 3.18 3.19 2.94 2.97 83783700 1996-01-05
## 5 1996-01-08 00:00:00 2.53 3.13 3.23 3.12 3.22 12594150 1996-01-08
## 6 1996-01-09 00:00:00 2.41 2.98 3.14 2.93 3.14 96068700 1996-01-09
# Compact overview of column types in the stock data.
str(object = oracle)
## tibble [7,260 × 8] (S3: tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:7260], format: "1996-01-02" "1996-01-03" ...
## $ Adj Close: num [1:7260] 2.59 2.47 2.44 2.57 2.53 ...
## $ Close : num [1:7260] 3.2 3.06 3.02 3.18 3.13 ...
## $ High : num [1:7260] 3.21 3.21 3.08 3.19 3.23 ...
## $ Low : num [1:7260] 3.14 3.06 2.94 2.94 3.12 ...
## $ Open : num [1:7260] 3.17 3.15 3.08 2.97 3.22 ...
## $ Volume : num [1:7260] 4.56e+07 5.55e+07 1.30e+08 8.38e+07 1.26e+07 ...
## $ date : Date[1:7260], format: "1996-01-02" "1996-01-03" ...
# Line chart of the opening price over time.
ggplot(oracle, aes(x = Date, y = Open)) +
  geom_line() +
  ggtitle("Distribusi") +
  xlab("Tahun") +
  ylab("Harga Pada Saat Pembukaan Pasar")
# Opening price over time: a thick dark-blue line with a translucent blue
# area drawn over it down to zero.
ggplot(oracle, aes(x = Date, y = Open)) +
  geom_line(colour = "darkblue", linewidth = 1.2) +
  geom_area(alpha = 0.3, fill = "blue") +
  ggtitle("Distribusi") +
  xlab("Tahun") +
  ylab("Harga Pada Saat Pembukaan Pasar")
# Opening and closing price over time on one chart.
# Colour is mapped inside aes() and resolved by scale_colour_manual() so the
# plot gets a legend; with fixed colours outside aes() the two series could
# not be told apart. The same red / lightblue colours are kept.
# The y label now covers both series ("Harga Pembukaan dan Penutupan",
# opening and closing price) rather than the opening price only.
ggplot(data = oracle, aes(x = Date)) +
  geom_line(aes(y = Open, colour = "Open"), linewidth = 1.5) +
  geom_line(aes(y = Close, colour = "Close"), linewidth = 1.5) +
  scale_colour_manual(
    name = NULL,
    breaks = c("Open", "Close"),
    values = c(Open = "red", Close = "lightblue")
  ) +
  labs(
    title = "Distribusi",
    x = "tanggal",
    y = "Harga Pembukaan dan Penutupan"
  )
# Opening and closing price over time, with extra room on the right of the
# x axis.
# * `oracle$Date` is POSIXct, so the previous `max(oracle$Date) + 100` added
#   only 100 seconds. The padding is now an explicit 100-day difftime, which
#   is the presumed intent (NOTE(review): confirm the desired margin).
# * Colour is mapped inside aes() and resolved by scale_colour_manual() so
#   the plot gets a legend; the same red / lightblue colours are kept.
# * The y label now covers both series instead of the opening price only.
ggplot(data = oracle, aes(x = Date)) +
  geom_line(aes(y = Open, colour = "Open"), linewidth = 1.2) +
  geom_line(aes(y = Close, colour = "Close"), linewidth = 1.2) +
  scale_colour_manual(
    name = NULL,
    breaks = c("Open", "Close"),
    values = c(Open = "red", Close = "lightblue")
  ) +
  xlim(
    min(oracle$Date),
    max(oracle$Date) + as.difftime(100, units = "days")
  ) +
  labs(
    title = "Distribusi",
    x = "tanggal",
    y = "Harga Pembukaan dan Penutupan"
  )