Exploratory Data Analysis and Visualization in R

Using Diamonds Dataset

Import library

# --- Importing Libraries ---
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(RColorBrewer)
library(glue)
library(hrbrthemes)
library(viridis)

# --- Preview the diamonds data frame (dimensions, column types, first values)
diamonds %>% glimpse()

## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

# Keep a working copy of `diamonds` under the name the rest of the script uses.
df_diam <- diamonds

#--- Data set info
# Row and column counts, reported through glue string interpolation.
n_row <- nrow(df_diam)
n_col <- ncol(df_diam)
glue('Total Columns :{n_col}
      Total Rows :{n_row}')

## Total Columns :10
## Total Rows :53940

Checking for missing values in the data set

# --- Check na
# Count the missing (NA) entries of a single vector / column.
#   col: the vector to inspect.
# Returns the number of NA entries as an integer.
check_na <- function(col) {
  missing_mask <- is.na(col)
  sum(missing_mask)
}

# NA count per column. vapply() visits the columns one by one, so each column
# keeps its own type; apply() would first coerce the whole data frame to a
# single-type matrix. The result is a named integer vector, as before.
vapply(df_diam, check_na, integer(1))

##   carat     cut   color clarity   depth   table   price       x       y       z 
##       0       0       0       0       0       0       0       0       0       0

#--- Summary statistics for every column
df_diam %>% summary()

##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
##

#--- List the column names
names(df_diam)

##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"

Let’s explore the data using ggplot2

# Scatter plot of carat against price for a random sample of 5000 diamonds,
# with points coloured by clarity.
df_diam %>%
  sample_n(5000) %>%
  ggplot(aes(x = price, y = carat, colour = clarity)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Relationship Between Diamond Price and Carat",
    subtitle = "Classify By Clarity",
    x = "Price (USD)",
    y = "Carat",
    caption = "Source: ggplot package, diamond"
  ) +
  theme_few()

# One row per (cut, clarity) combination: number of diamonds, mean carat and
# mean price. Reads the df_diam working copy like the other chunks, and
# `.groups = "drop"` returns an ungrouped tibble so no leftover grouping by
# `cut` leaks into later steps.
df_dimavg <- df_diam %>%
  group_by(cut, clarity) %>%
  summarise(
    n = n(),
    avg_carat = mean(carat),
    avg_price = mean(price),
    .groups = "drop"
  )
colnames(df_dimavg)

## [1] "cut"       "clarity"   "n"         "avg_carat" "avg_price"

# Average price against average carat: one point per row of df_dimavg,
# joined by one line per cut and coloured by cut.
ggplot(
  df_dimavg,
  aes(x = avg_carat, y = avg_price, group = cut, colour = cut)
) +
  geom_point(alpha = 0.5, size = 2) +
  geom_line(size = .5) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Relationship Between Average Diamond Price and Carat",
    subtitle = "Classify By Cut",
    x = "Carat",
    y = "Price (USD)",
    caption = "Source: ggplot package, diamond"
  ) +
  theme_minimal()

#--- avg_price by cut, one line per clarity
# Mean price for every (cut, clarity) pair, drawn as one line per clarity
# grade across the cut levels. `.groups = "drop"` hands ggplot an ungrouped
# tibble and avoids the "grouped output by 'cut'" message.
df_diam %>%
  group_by(cut, clarity) %>%
  summarise(avg_price = mean(price), .groups = "drop") %>%
  ggplot(aes(cut, avg_price, group = clarity, color = clarity)) +
  geom_line(aes(linetype = clarity)) +
  geom_point(alpha = 0.8, size = 2) +
  theme_minimal() +
  labs(
    title = "Relationship Between Average Diamond Price and Cut Quality",
    x = "Cut Quality",
    y = "Price (USD)",
    subtitle = "Classify By Diamond Clarity",
    caption = "Source: ggplot package, diamond"
  )

## `summarise()` has grouped output by 'cut'. You can override using the `.groups`
## argument.

#--- avg_carat by clarity, one line per cut
# Mean carat for every (cut, clarity) pair, drawn as one line per cut across
# the clarity levels. `.groups = "drop"` hands ggplot an ungrouped tibble and
# avoids the "grouped output by 'cut'" message.
df_diam %>%
  group_by(cut, clarity) %>%
  summarise(avg_carat = mean(carat), .groups = "drop") %>%
  ggplot(aes(clarity, avg_carat, group = cut, color = cut)) +
  geom_line() +
  geom_point(alpha = 0.5, size = 2) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Relationship Between Average Diamond Carat and Clarity",
    x = "Diamond Clarity",
    y = "Average Carat",
    subtitle = "Classify By Diamond Cut Quality",
    caption = "Source: ggplot package, diamond"
  )

## `summarise()` has grouped output by 'cut'. You can override using the `.groups`
## argument.

# Price against carat for a random sample of 5000 diamonds: one panel per cut
# (two panels per row), points coloured by clarity.
df_diam %>%
  sample_n(5000) %>%
  ggplot(aes(x = carat, y = price, colour = clarity)) +
  geom_point(alpha = 0.45) +
  facet_wrap(~ cut, ncol = 2) +
  scale_color_brewer(palette = "RdYlBu") +
  theme_minimal() +
  labs(
    title = "Relationship Between Carat and Price of Diamonds ",
    subtitle = "Classify by cut quality",
    x = "Carat",
    y = "Price(USD)",
    caption = "Source: ggplot package, with 5000 samples"
  )

Using the nycflights13 Dataset

Load library

library(nycflights13)

# Column names of the weather table.
names(weather)

##  [1] "origin"     "year"       "month"      "day"        "hour"      
##  [6] "temp"       "dewp"       "humid"      "wind_dir"   "wind_speed"
## [11] "wind_gust"  "precip"     "pressure"   "visib"      "time_hour"

Replace missing values with mean imputation

# Working copy of the weather table.
weather_1 <- weather
# Mean of the non-missing temperatures. `TRUE` is spelled out because `T` is
# an ordinary variable that can be reassigned.
avg_temp <- mean(weather$temp, na.rm = TRUE)
avg_temp # mean temp = 55.26039

## [1] 55.26039

# Add temp_f: a copy of temp whose missing entries are replaced by avg_temp.
weather_1 <- mutate(weather_1, temp_f = replace_na(temp, avg_temp))
print(weather_1)

Observe the missing data

# Count the missing (NA) entries of a single vector / column.
#   col: the vector to inspect.
# Returns the number of NA entries as an integer.
check_na <- function(col) {
  is_missing <- is.na(col)
  sum(is_missing)
}

# NA count per column. vapply() visits the columns one by one, so numeric,
# character and date-time columns are each checked in their own type;
# apply() would first coerce the whole table to a single-type matrix.
vapply(weather_1, check_na, integer(1))

##     origin       year      month        day       hour       temp       dewp 
##          0          0          0          0          0          1          1 
##      humid   wind_dir wind_speed  wind_gust     precip   pressure      visib 
##          1        460          4      20778          0       2729          0 
##  time_hour     temp_f 
##          0          0

Plot a heatmap illustrating the New York City temperature in 2013

# Heatmap of the imputed temperature (temp_f): day on the x axis, hour on the
# y axis, one panel column per month and one panel row per year.
weather_1 %>%
  ggplot(aes(x = day, y = hour, fill = temp_f)) +
  geom_tile() +
  facet_grid(year ~ month) +
  scale_fill_distiller(palette = "RdYlBu") +
  guides(
    fill = guide_colourbar(
      title = "Temp F",
      barwidth = 0.5,
      barheight = 15
    )
  ) +
  labs(
    title = "Hourly the New York weather 2013 ",
    subtitle = "Temperature (Fahrenheit)",
    x = "Day",
    y = "Hour",
    caption = "Source: ggplot package, nycflight13"
  ) +
  theme_minimal()

# Attach the weather observations to each row of fg_df1. No `by` is given, so
# left_join() matches on every column name the two tables share.
fght_wtr <- left_join(fg_df1, weather)

# Label each row's weather from precipitation and visibility.
# Rows that reach the third rule have a missing precip or visib and matched
# neither threshold rule; they are labelled NA instead of silently falling
# through to "good".
d2 <- fght_wtr %>%
  mutate(weather = case_when(
    precip >= 0.3 | visib < 3 ~ "bad",
    precip >= 0.15 | visib < 5 ~ "medium",
    is.na(precip) | is.na(visib) ~ NA_character_,
    TRUE ~ "good"
  ))

# Random sample of 15000 rows used by the temperature plot.
n_1kd2 <- sample_n(d2, 15000)

# Mean temperature per (month, hour) from the sampled rows, drawn as one panel
# per month. `.groups = "drop"` avoids the "grouped output by 'month'"
# message. The legend title and y-axis label are corrected
# ("Temerature" and the unbalanced "Avg Temp F)").
n_1kd2 %>%
  select(month, temp, hour) %>%
  group_by(month, hour) %>%
  summarise(AVG_TEMP = mean(temp, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = hour, y = AVG_TEMP, group = month, color = AVG_TEMP)) +
  geom_point(alpha = 0.75, size = 2) +
  geom_line(color = "#7fcdbb") +
  facet_wrap(~ month, ncol = 4) +
  theme_minimal() +
  labs(
    title = "Monthly Average Temperature in 2013",
    color = "Temperature (F)",
    x = "Hour (24)",
    y = "Avg Temp (F)",
    subtitle = "Using 15000 samples",
    caption = "Source: ggplot package"
  ) +
  scale_color_distiller(palette = "RdYlBu")

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# most delay date
# Per (day, month): number of rows and the mean arr_delay truncated toward
# zero, sorted from the largest mean delay down, plus an hours/minutes split
# of that mean. `.groups = "drop"` returns an ungrouped tibble.
dlydt <- fg_df1 %>%
  group_by(day, month) %>%
  summarise(
    count = n(),
    delay_mn = trunc(mean(arr_delay, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  arrange(desc(delay_mn)) %>%
  mutate(
    h = trunc(delay_mn / 60),
    # Remainder relative to the truncated hours, so h * 60 + mn == delay_mn
    # also holds for negative means. `delay_mn %% 60` is a floor-modulo and
    # would turn -5 into 55 while h stays 0.
    mn = delay_mn - h * 60
  )

## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.

# Show the per-day delay table.
print(dlydt)

Plot a heatmap showing flight delays

# Heatmap of the mean delay (delay_mn) by day (x) and month (y) on a
# diverging blue-white-red fill. Title and subtitle wording corrected
# ("New Yorks", "in minute").
ggplot(dlydt, aes(day, month, fill = delay_mn)) +
  geom_tile() +
  theme_minimal() +
  scale_fill_gradient2(
    low = "#075AFF",
    mid = "#F2F2F2",
    high = "#FF0000"
  ) +
  guides(fill = guide_colourbar(
    title = "Delay(mins)",
    barwidth = 0.5,
    barheight = 15
  )) +
  labs(
    title = "Flight Delay in New York 2013",
    x = "Day",
    y = "Month",
    subtitle = "Heatmap of average flight delay in minutes",
    caption = "Source: ggplot package, nycflight13"
  )

# One 20-bin temperature histogram per month, laid out in 4 rows. Fill is
# mapped to the month number on a viridis scale and the legend is hidden.
# Subtitle spelling corrected ("Histrogram").
ggplot(weather, aes(temp, fill = month)) +
  geom_histogram(color = "black", bins = 20) +
  facet_wrap(~ month, nrow = 4) +
  theme_minimal() +
  scale_fill_viridis(option = "D") +
  theme(legend.position = "none") +
  labs(
    x = "Temperature(F)",
    y = "",
    subtitle = "Histogram of monthly temperature in New York 2013",
    caption = "Source: ggplot package, nycflight13, weather"
  )

# Keep only the rows whose manufacturer is one of the three listed values.
manuf <- d2 %>%
  filter(manufacturer %in% c("BOEING", "AIRBUS", "AIRBUS INDUSTRIE"))

# Arrival delay against departure delay for a random sample of 1000 of those
# rows: one panel per airport_name, points coloured by manufacturer.
# Legend title and plot title wording corrected ("Air caft", "New Yorks").
ggplot(
  sample_n(manuf, 1000),
  aes(dep_delay, arr_delay, color = manufacturer)
) +
  geom_point(alpha = 0.45) +
  facet_wrap(~ airport_name, ncol = 1) +
  theme_minimal() +
  labs(
    title = "Flight Delay in New York 2013",
    color = "Aircraft Manufacturer:",
    x = "Departure Delay",
    y = "Arrival Delay",
    subtitle = "Relationship between flight arrival and departure delay",
    caption = "Source: ggplot package, nycflight13"
  ) +
  theme(legend.position = "bottom")

# Ridgeline density of air_time per airport_name for a random sample of
# 15000 rows, filled along the x value on a viridis scale.
# - after_stat(x) replaces the deprecated stat(x).
# - geom_density_ridges_gradient() is called through the ggridges namespace
#   because none of the visible library() calls attaches ggridges.
ggplot(
  sample_n(d2, 15000),
  aes(air_time, airport_name, fill = after_stat(x))
) +
  ggridges::geom_density_ridges_gradient(scale = 0.85) +
  scale_fill_viridis_c(option = "D", alpha = 0.85) +
  theme_minimal() +
  labs(title = "Flight Duration NY 2013") +
  theme(legend.position = "none") +
  ylab("") +
  xlab("Flight Duration (mins)")

## Picking joint bandwidth of 13.5

Exploratory Data Analysis and Visualization in R

Sirawit N.

Using Diamonds Dataset

Import library

Checking for missing values in the data set

Let’s explore the data using ggplot2

Using the nycflights13 Dataset

Plot a heatmap showing flight delays