# --- Importing Libraries ---
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(RColorBrewer)
library(glue)
library(hrbrthemes)
library(viridis)
# --- Preview the diamonds data: dimensions, column types and first values
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Working handle on the diamonds data used by the sections below.
df_diam <- diamonds
# --- Data set info
# Row and column counts of the data frame.
n_row <- nrow(df_diam)
n_col <- ncol(df_diam)
# glue() substitutes the {n_col} / {n_row} placeholders; the string literal
# spans two lines on purpose so the report prints on two lines.
glue('Total Columns :{n_col}
Total Rows :{n_row}')
## Total Columns :10
## Total Rows :53940
# --- Check NA
# Return how many entries of `col` are NA.
check_na <- function(col) {
  na_flags <- is.na(col)
  sum(na_flags)
}
# NA count per column. vapply() visits each column as-is and enforces one
# integer per column; apply() would first coerce the whole data frame to a
# matrix, discarding the column types.
vapply(df_diam, check_na, integer(1))
## carat cut color clarity depth table price x y z
## 0 0 0 0 0 0 0 0 0 0
# --- Summary statistics for every column
df_diam %>% summary()
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
# --- List the column names
names(df_diam)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# Scatter plot on a random sample of 5000 diamonds: price on the x axis,
# carat on the y axis, points coloured by clarity.
df_diam %>%
  sample_n(5000) %>%
  ggplot(aes(x = price, y = carat, color = clarity)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Relationship Between Diamond Price and Carat",
    subtitle = "Classify By Clarity",
    x = 'Price (USD)',
    y = "Carat",
    caption = "Source: ggplot package, diamond"
  ) +
  theme_few()
# One row per (cut, clarity) combination: number of diamonds, mean carat and
# mean price. `.groups = "drop"` returns an ungrouped tibble so no leftover
# grouping by `cut` carries into later steps.
df_dimavg <- df_diam %>%
  group_by(cut, clarity) %>%
  summarise(
    n = n(),
    avg_carat = mean(carat),
    avg_price = mean(price),
    .groups = "drop"
  )
colnames(df_dimavg)
## [1] "cut" "clarity" "n" "avg_carat" "avg_price"
# Average price against average carat; each cut gets its own coloured line
# connecting its clarity-level averages.
df_dimavg %>%
  ggplot(aes(avg_carat, avg_price, group = cut, color = cut)) +
  geom_point(alpha = 0.5, size = 2) +
  # Line thickness goes through `linewidth`; `size` for lines is deprecated
  # as of ggplot2 3.4.0.
  geom_line(linewidth = 0.5) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Relationship Between Average Diamond Price and Carat",
    x = 'Carat',
    y = "Price (USD)",
    subtitle = "Classify By Cut",
    caption = "Source: ggplot package, diamond"
  )
# --- Average price by cut, one line per clarity
# Mean price is computed per (cut, clarity); `.groups = "drop"` hands an
# ungrouped tibble to ggplot and avoids the summarise() regrouping message.
df_diam %>%
  group_by(cut, clarity) %>%
  summarise(avg_price = mean(price), .groups = "drop") %>%
  ggplot(aes(cut, avg_price, group = clarity, color = clarity)) +
  # Clarity is encoded twice (colour and line type) to keep the lines apart.
  geom_line(aes(linetype = clarity)) +
  geom_point(alpha = 0.8, size = 2) +
  theme_minimal() +
  labs(
    title = "Relationship Between Average Diamond Price and Cut Quality",
    x = 'Cut Quality',
    y = "Price (USD)",
    subtitle = "Classify By Diamond Clarity",
    caption = "Source: ggplot package, diamond"
  )
# --- Average carat by clarity, one line per cut
# Mean carat is computed per (cut, clarity); `.groups = "drop"` hands an
# ungrouped tibble to ggplot and avoids the summarise() regrouping message.
df_diam %>%
  group_by(cut, clarity) %>%
  summarise(avg_carat = mean(carat), .groups = "drop") %>%
  ggplot(aes(clarity, avg_carat, group = cut, color = cut)) +
  geom_line() +
  geom_point(alpha = 0.5, size = 2) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Relationship Between Average Diamond Carat and Clarity",
    x = 'Diamond Clarity',
    y = "Average Carat",
    subtitle = "Classify By Diamond Cut Quality",
    caption = "Source: ggplot package, diamond"
  )
# Carat against price on a random sample of 5000 diamonds: points coloured by
# clarity, one panel per cut laid out in two columns.
df_diam %>%
  sample_n(5000) %>%
  ggplot(aes(x = carat, y = price, color = clarity)) +
  geom_point(alpha = 0.45) +
  facet_wrap(~ cut, ncol = 2) +
  scale_color_brewer(palette = "RdYlBu") +
  labs(
    title = "Relationship Between Carat and Price of Diamonds ",
    subtitle = "Classify by cut quality",
    x = 'Carat',
    y = "Price(USD)",
    caption = "Source: ggplot package, with 5000 samples"
  ) +
  theme_minimal()
# --- Load library
# Attach nycflights13, then list the columns of its `weather` table.
library(nycflights13)
names(weather)
## [1] "origin" "year" "month" "day" "hour"
## [6] "temp" "dewp" "humid" "wind_dir" "wind_speed"
## [11] "wind_gust" "precip" "pressure" "visib" "time_hour"
# --- Replace missing values with mean imputation
# Mean of all non-missing temperatures; used as the fill-in value below.
avg_temp <- mean(weather$temp, na.rm = TRUE)
avg_temp # mean temp = 55.26039
## [1] 55.26039
# Keep the raw `temp` column untouched and add `temp_f`, in which missing
# temperatures are replaced by the overall mean.
weather_1 <- weather %>%
  mutate(temp_f = replace_na(temp, avg_temp))
weather_1
# --- Observe the missing data
# Count the NA entries of a single column (or any vector) `col`.
check_na <- function(col) {
  missing_mask <- is.na(col)
  sum(missing_mask)
}
# NA count per column of the imputed table. vapply() visits each column as-is
# and enforces one integer per column; apply() would first coerce the whole
# data frame to a matrix, discarding the column types.
vapply(weather_1, check_na, integer(1))
## origin year month day hour temp dewp
## 0 0 0 0 0 1 1
## humid wind_dir wind_speed wind_gust precip pressure visib
## 1 460 4 20778 0 2729 0
## time_hour temp_f
## 0 0
# --- Plot a heatmap illustrating the New York City temperature in 2013
# Heatmap of the imputed temperature (`temp_f`): day on the x axis, hour on
# the y axis, with a facet grid of year (rows) by month (columns).
weather_1 %>%
  ggplot(aes(x = day, y = hour, fill = temp_f)) +
  geom_tile() +
  facet_grid(year ~ month) +
  scale_fill_distiller(palette = "RdYlBu") +
  # Tall, narrow colour bar for the legend.
  guides(
    fill = guide_colourbar(
      title = "Temp F",
      barwidth = 0.5,
      barheight = 15
    )
  ) +
  labs(
    title = "Hourly the New York weather 2013 ",
    subtitle = "Temperature (Fahrenheit)",
    x = 'Day',
    y = "Hour",
    caption = "Source: ggplot package, nycflight13"
  ) +
  theme_minimal()
# Attach weather observations to the flight records. No `by` is supplied, so
# left_join() matches on every column name the two tables have in common.
# NOTE(review): `fg_df1` is not defined in the visible part of this file —
# confirm where it is built and which columns it shares with `weather`.
fght_wtr <- fg_df1 %>% left_join(weather)
# Label each row's conditions. case_when() uses the first rule that is TRUE;
# rows where neither rule is TRUE (including rows whose precip/visib are
# missing) fall through to "good".
d2 <- fght_wtr %>%
  mutate(
    weather = case_when(
      precip >= 0.3 | visib < 3 ~ "bad",
      precip >= 0.15 | visib < 5 ~ "medium",
      TRUE ~ "good"
    )
  )
# Random sample of 15000 joined flight/weather rows.
n_1kd2 <- sample_n(d2, 15000)
# Mean temperature per (month, hour), drawn as one panel per month.
# `.groups = "drop"` returns an ungrouped tibble and avoids the summarise()
# regrouping message.
n_1kd2 %>%
  select(month, temp, hour) %>%
  group_by(month, hour) %>%
  summarise(AVG_TEMP = mean(temp, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = hour, y = AVG_TEMP, group = month, color = AVG_TEMP)) +
  geom_point(alpha = 0.75, size = 2) +
  # The connecting line uses one fixed colour; only the points carry the
  # temperature colour scale.
  geom_line(color = '#7fcdbb') +
  facet_wrap(~ month, ncol = 4) +
  theme_minimal() +
  labs(
    title = "Monthly Average Temperature in 2013",
    color = 'Temperature (F)',
    x = 'Hour (24)',
    y = "Avg Temp (F)",
    subtitle = "Using 15000 samples",
    caption = "Source: ggplot package"
  ) +
  scale_color_distiller(palette = "RdYlBu")
# --- Dates with the largest average arrival delay
# Per calendar date (day, month): number of flights and the truncated mean
# arrival delay, sorted from the longest delay down.
# NOTE(review): `fg_df1` is not defined in the visible part of this file —
# confirm where it is built.
dlydt <- fg_df1 %>%
  group_by(day, month) %>%
  summarise(
    count = n(),
    delay_mn = trunc(mean(arr_delay, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  arrange(desc(delay_mn)) %>%
  # Split the delay into whole hours and leftover minutes. The remainder is
  # derived from `h` so that 60 * h + mn == delay_mn also holds for negative
  # delays (early arrivals); `%%` would return a positive remainder there.
  mutate(
    h = trunc(delay_mn / 60),
    mn = delay_mn - 60 * h
  )
dlydt
# Day-by-month heatmap of the truncated mean arrival delay. The diverging fill
# runs blue -> light grey -> red; no midpoint is given, so the scale's default
# midpoint applies.
dlydt %>%
  ggplot(aes(x = day, y = month, fill = delay_mn)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "#075AFF",
    mid = "#F2F2F2",
    high = "#FF0000"
  ) +
  # Tall, narrow colour bar for the legend.
  guides(
    fill = guide_colourbar(
      title = "Delay(mins)",
      barwidth = 0.5,
      barheight = 15
    )
  ) +
  labs(
    title = "Flight Delay in New Yorks 2013 ",
    subtitle = "Heatmap of average flight delay in minute",
    x = 'Day',
    y = "Month",
    caption = "Source: ggplot package, nycflight13"
  ) +
  theme_minimal()
# Temperature histograms (20 bins), one panel per month; bars are filled by
# month on the viridis scale and the legend is hidden because the facets
# already identify the month.
ggplot(weather, aes(temp, fill = month)) +
  geom_histogram(color = 'black', bins = 20) +
  facet_wrap(~ month, nrow = 4) +
  theme_minimal() +
  scale_fill_viridis(option = "D") +
  # Must come after theme_minimal(), which would otherwise reset the legend.
  theme(legend.position = "none") +
  labs(
    x = 'Temperature(F)',
    y = '',
    subtitle = "Histogram monthly temperature in New York 2013",
    caption = "Source: ggplot package, nycflight13, weather"
  )
# Keep only flights whose aircraft manufacturer is Boeing or Airbus (Airbus
# appears under two spellings in the data).
manuf <- d2 %>%
  filter(manufacturer %in% c('BOEING', 'AIRBUS', "AIRBUS INDUSTRIE"))
# Departure vs arrival delay on a random sample of 1000 of those flights,
# coloured by manufacturer, with one stacked panel per airport.
ggplot(
  sample_n(manuf, 1000),
  aes(dep_delay, arr_delay, color = manufacturer)
) +
  geom_point(alpha = 0.45) +
  facet_wrap(~ airport_name, ncol = 1) +
  theme_minimal() +
  labs(
    title = "Flight Delay in New Yorks 2013 ",
    color = 'Aircraft Manufacturer :',
    x = 'Departure Delay',
    y = "Arrival Delay",
    subtitle = "Relationship between flight arrival and departure delay",
    caption = "Source: ggplot package, nycflight13"
  ) +
  # Must come after theme_minimal(), which would otherwise reset the legend.
  theme(legend.position = 'bottom')
# Ridgeline densities of air time per airport on a random sample of 15000
# rows; each ridge is shaded by its x position (the air time).
# after_stat(x) is the current spelling of the deprecated stat(x).
ggplot(
  sample_n(d2, 15000),
  aes(air_time, airport_name, fill = after_stat(x))
) +
  # ggridges is not attached by the library() calls visible in this file, so
  # the geom is referenced through its namespace.
  ggridges::geom_density_ridges_gradient(scale = 0.85) +
  scale_fill_viridis_c(option = "D", alpha = 0.85) +
  theme_minimal() +
  labs(title = 'Flight Duration NY 2013') +
  # Must come after theme_minimal(), which would otherwise reset the legend.
  theme(legend.position = "none") +
  ylab('') +
  xlab('Flight Duration (mins)')
## Picking joint bandwidth of 13.5