This tutorial provides basic knowledge and applications of R for data analysis. You will see how many useful and interesting things R can do with your data. I am still an R learner, so this tutorial is based on the work of Hadley Wickham, one of the smartest R programmers, and you can find his tutorials here

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

# Load the mpg data set into the workspace and refer to it as df from here on.
data("mpg")
df <- mpg

# Preview the first rows.
head(df)

## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
##          <chr> <chr> <dbl> <int> <int>      <chr> <chr> <int> <int> <chr>
## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
## # ... with 1 more variables: class <chr>

*Simple Scatter Plot

# Scatter plot 

# Basic scatter plot of hwy against displ; every point shares the same fixed
# colour, size and shape. The title is centred and set in bold.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(shape = 16, size = 2, colour = 4) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Adding another categorical variable to the two dimensional plot

# Same scatter plot, with the point colour mapped to the categorical
# variable class.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class), shape = 16, size = 2) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

# Jitter the points so that overlapping observations are all visible

# Scatter plot coloured by class, drawn with the "jitter" position adjustment.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(
    mapping = aes(colour = class),
    position = "jitter",
    shape = 16,
    size = 2
  ) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Using different types of color and shapes with categorical and continuous variables

library(viridis)

## Loading required package: viridisLite

# Scatter plot with the point colour mapped to the continuous variable cty.
# The viridis package is loaded just above for this section, so its continuous
# colour scale is applied here; without it the plot falls back to the default
# gradient and the package goes unused.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = cty), shape = 16, size = 2) +
  viridis::scale_color_viridis() +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Facet_wrap

# Store model as a factor (the plot below does not reference this column).
df <- df %>% mutate(model = as.factor(model))

# One panel per class, wrapped into a grid of panels.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class), shape = 16, size = 2) +
  facet_wrap(~class) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Facet_grid

# Plot a scatter plot for each class, arranged as columns (a single row)

# facet_grid(. ~ class): one column of the grid per class.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class), shape = 16, size = 2) +
  facet_grid(. ~ class) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

# Arrange the plots as rows (a single column)

# facet_grid(class ~ .): one row of the grid per class.
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class), shape = 16, size = 2) +
  facet_grid(class ~ .) +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Both row and column organization

# Two-way grid: drv defines the rows, class the columns; all panels share the
# same axis ranges (scales = "fixed").
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class), shape = 16, size = 2) +
  facet_grid(drv ~ class, scales = "fixed") +
  labs(
    title = "Scatter Plot",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
  )

*Using geom_smooth method

# Points and a smoothed curve, both coloured by drv, so one curve is drawn per
# drive type. The shared aesthetics are declared once in ggplot().
ggplot(df, mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Scatter plot ",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

## `geom_smooth()` using method = 'loess'

# Points coloured by drv, with a single smoothed curve through all of the data
# (the smoother has no colour mapping).
ggplot(df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv)) +
  geom_smooth() +
  labs(
    title = "Scatter plot ",
    x = "Car Engine Size",
    y = "Fuel Efficiency"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

## `geom_smooth()` using method = 'loess'

# Hide the confidence band around the smoothed curve with se = FALSE
# (spelled out, because the shorthand F is an ordinary variable that can be
# reassigned). The curve is fitted to the midsize cars only, while the points
# show every car, coloured by drv.
ggplot(data = df) +
  geom_point(mapping = aes(x = displ, y = hwy, color = drv)) +
  geom_smooth(
    data = filter(df, class == "midsize"),
    mapping = aes(x = displ, y = hwy),
    se = FALSE
  ) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter plot "
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

## `geom_smooth()` using method = 'loess'

# Statistical Transformations

# Working with diamond dataset
library(ggplot2)

# Preview the first rows of the diamonds data set.
head(diamonds)

## # A tibble: 6 x 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48

*Geom_bar

# Bar chart of the number of diamonds per cut, each bar filled by its cut.
ggplot(diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar()

*Stat_count()

# The same counts drawn through stat_count(), with fixed outline and fill
# colours and a centred title.
ggplot(diamonds, mapping = aes(x = cut)) +
  stat_count(fill = 3, colour = 4) +
  labs(
    title = "Bar Plot",
    x = "Cut",
    y = "Frequency/Count"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

*Geom_proportion

# This plot shows the proportion of each cut type
# Bars show the computed prop statistic instead of raw counts; group = 1 puts
# every cut into a single group.
# NOTE(review): newer ggplot2 releases prefer after_stat(prop) over ..prop.. -
# confirm the installed version before switching.
ggplot(diamonds) +
  geom_bar(
    mapping = aes(x = cut, y = ..prop.., group = 1),
    fill = 3,
    colour = 4
  )

*Stat_Summary

# For each cut, summarise depth by its mean, with a range running from the
# minimum to the maximum.
# NOTE(review): newer ggplot2 releases rename fun.y / fun.ymin / fun.ymax to
# fun / fun.min / fun.max - confirm the installed version before switching.
ggplot(diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth, colour = cut),
    fun.y = mean,
    fun.ymin = min,
    fun.ymax = max
  ) +
  labs(
    title = "Errorbar Plot",
    x = "Cut Type",
    y = "Depth"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

*Geom_col

# geom_col() takes the bar heights from the y variable (price) rather than
# counting rows.
ggplot(diamonds, mapping = aes(x = cut, y = price)) +
  geom_col(fill = 4)

*Position and adjustments of bar plot

# Default position: the clarity segments are stacked within each cut.
ggplot(diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar()

# position = "dodge": the clarity bars sit side by side instead of stacked.
ggplot(diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar(position = "dodge")

# position = "fill", drawn semi-transparent (alpha = 0.4).
ggplot(diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill", alpha = 0.4)

*Boxplot

# Price distribution per cut, every box in the same fill colour.
ggplot(diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = 4)

# The same boxplots filled by cut and flipped so the boxes run horizontally.
ggplot(diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(mapping = aes(fill = cut)) +
  coord_flip()

*Some interesting plots

# Bar chart of counts per cut drawn in polar coordinates.
ggplot(diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar() +
  coord_polar()

#Data Transformation

This section works with the nycflights13 dataset and uses the dplyr package for data manipulation.

library(dplyr)

library(nycflights13)

# Refer to the flights table as flight for the rest of the section.
flight <- flights

# Number of rows and columns.
dim(flight)

## [1] 336776     19

# Preview the first rows.
head(flight)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

*Some dplyr verbs – filter() for selecting rows by their values – select() for selecting columns by name – arrange() for reordering rows – mutate() for creating new variables from existing ones – summarise() for producing summary statistics

# Interesting things

sqrt(2)^2==2 # Exact comparison: evaluates to FALSE, as the output below shows

## [1] FALSE

near(sqrt(2)^2,2) # Comparison within a tolerance: gives TRUE, because the computer stores numbers as approximations

## [1] TRUE

*Find all flights that departed in May or June

# Two equivalent ways of keeping the flights whose month is 5 or 6. Despite
# their names, df_month5 and df_month6 both hold rows for both months.
df_month5 <- filter(flight, month == 5 | month == 6)  # explicit OR
df_month6 <- filter(flight, month %in% c(5, 6))       # set membership

head(df_month6)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     5     1        9           1655       434      308
## 2  2013     5     1      451            500        -9      641
## 3  2013     5     1      537            540        -3      836
## 4  2013     5     1      544            545        -1      818
## 5  2013     5     1      548            600       -12      831
## 6  2013     5     1      549            600       -11      804
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Select all flights except those that departed in May or June

# Keep the flights whose month is neither 5 nor 6 (both conditions must hold).
df_not56 <- filter(flight, month != 5, month != 6)

head(df_not56)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

*Count missing values

# Number of flights whose dep_delay is missing.
flight %>%
  pull(dep_delay) %>%
  is.na() %>%
  sum()

## [1] 8255

# First rows of the flights whose dep_delay is missing.
flight %>%
  filter(is.na(dep_delay)) %>%
  head()

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1       NA           1630        NA       NA
## 2  2013     1     1       NA           1935        NA       NA
## 3  2013     1     1       NA           1500        NA       NA
## 4  2013     1     1       NA            600        NA       NA
## 5  2013     1     2       NA           1540        NA       NA
## 6  2013     1     2       NA           1620        NA       NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

*Arrange

# Increasing order

# Sort by month, then by day within each month (ascending).
flight_arrange <- arrange(flight, month, day)

head(flight_arrange)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

#Decreasing order

# Sort by departure time, latest first.
flight_dec <- arrange(flight, desc(dep_time))

head(flight_dec)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013    10    30     2400           2359         1      327
## 2  2013    11    27     2400           2359         1      515
## 3  2013    12     5     2400           2359         1      427
## 4  2013    12     9     2400           2359         1      432
## 5  2013    12     9     2400           2250        70       59
## 6  2013    12    13     2400           2359         1      432
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

# Bring the rows with a missing dep_time to the top, then those with a missing
# dep_delay. Sorting on "is not missing" puts FALSE (the missing rows) first.
flight_NA <- arrange(flight, !is.na(dep_time), !is.na(dep_delay))

head(flight_NA)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1       NA           1630        NA       NA
## 2  2013     1     1       NA           1935        NA       NA
## 3  2013     1     1       NA           1500        NA       NA
## 4  2013     1     1       NA            600        NA       NA
## 5  2013     1     2       NA           1540        NA       NA
## 6  2013     1     2       NA           1620        NA       NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

*Select

# Select all variables, except for month and day

# Drop month and day and keep every other column.
flight_select1 <- select(flight, -month, -day)

head(flight_select1)

## # A tibble: 6 x 17
##    year dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013      517            515         2      830            819
## 2  2013      533            529         4      850            830
## 3  2013      542            540         2      923            850
## 4  2013      544            545        -1     1004           1022
## 5  2013      554            600        -6      812            837
## 6  2013      554            558        -4      740            728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

# Select all variables whose names contain `dep`

# Keep the columns whose names contain "dep".
flight_dep <- select(flight, contains("dep"))

head(flight_dep)

## # A tibble: 6 x 3
##   dep_time sched_dep_time dep_delay
##      <int>          <int>     <dbl>
## 1      517            515         2
## 2      533            529         4
## 3      542            540         2
## 4      544            545        -1
## 5      554            600        -6
## 6      554            558        -4

# Select variables and reorganize variable position

# Move dep_time and dep_delay to the front; everything() appends the
# remaining columns in their existing order.
flight_reorganize <- select(flight, dep_time, dep_delay, everything())

head(flight_reorganize)

## # A tibble: 6 x 19
##   dep_time dep_delay  year month   day sched_dep_time arr_time
##      <int>     <dbl> <int> <int> <int>          <int>    <int>
## 1      517         2  2013     1     1            515      830
## 2      533         4  2013     1     1            529      850
## 3      542         2  2013     1     1            540      923
## 4      544        -1  2013     1     1            545     1004
## 5      554        -6  2013     1     1            600      812
## 6      554        -4  2013     1     1            558      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

# Select dep_time, dep_delay, arr_time, and arr_delay

# Keep the columns whose names start with "dep", followed by those starting
# with "arr".
flight_s1 <- select(flight, starts_with("dep"), starts_with("arr"))

head(flight_s1)

## # A tibble: 6 x 4
##   dep_time dep_delay arr_time arr_delay
##      <int>     <dbl>    <int>     <dbl>
## 1      517         2      830        11
## 2      533         4      850        20
## 3      542         2      923        33
## 4      544        -1     1004       -18
## 5      554        -6      812       -25
## 6      554        -4      740        12

Select a specific set of variables by name

# Pick columns by name, with the names supplied as character strings.
flight_oneof <- select(
  flight,
  one_of("month", "day", "dep_delay", "arr_time")
)

head(flight_oneof)

## # A tibble: 6 x 4
##   month   day dep_delay arr_time
##   <int> <int>     <dbl>    <int>
## 1     1     1         2      830
## 2     1     1         4      850
## 3     1     1         2      923
## 4     1     1        -1     1004
## 5     1     1        -6      812
## 6     1     1        -4      740

*Mutate

# Add two derived columns:
#   speed = distance / air_time * 60
#   gain  = arr_delay - dep_delay
flight_new_column <- mutate(
  flight,
  speed = distance / air_time * 60,
  gain = arr_delay - dep_delay
)

head(flight_new_column)

## # A tibble: 6 x 21
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 14 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, speed <dbl>, gain <dbl>

Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

# Rank every flight by departure delay, largest delay first (rank 1).
delay_flight <- min_rank(desc(flight$dep_delay))

# The rank vector alone does not identify the flights, so also keep the rows
# ranked 10 or better, ordered from the longest delay down. Flights tied at
# the cut-off are all kept, so more than 10 rows are possible.
# NOTE(review): assumes min_rank() returns NA for a missing dep_delay, so
# filter() drops those rows - confirm against the dplyr documentation.
top10_delayed <- flight %>%
  filter(min_rank(desc(dep_delay)) <= 10) %>%
  arrange(desc(dep_delay))

head(delay_flight)

## [1] 114150 103893 114150 144947 258934 209494

*Summarise

# Per-day summary: mean departure delay, number of flights and mean arrival
# delay. Missing delays are ignored via na.rm = TRUE (spelled out, because the
# shorthand T is an ordinary variable that can be reassigned).
# NOTE(review): the result shares its name with dplyr's summarise_all()
# function; the name is kept so existing references still work.
summarise_all <- flight %>%
  group_by(month, day) %>%
  summarise(
    delay = mean(dep_delay, na.rm = TRUE),
    count = n(),
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )

head(summarise_all)

## # A tibble: 6 x 5
## # Groups:   month [1]
##   month   day     delay count arr_delay
##   <int> <int>     <dbl> <int>     <dbl>
## 1     1     1 11.548926   842 12.651023
## 2     1     2 13.858824   943 12.692888
## 3     1     3 10.987832   914  5.733333
## 4     1     4  8.951595   915 -1.932819
## 5     1     5  5.732218   720 -1.525802
## 6     1     6  7.148014   832  4.236429

 # Remove all NA values and then calculate mean
# Drop the rows with a missing dep_time or sched_dep_time first, then average
# both columns for every carrier/destination pair.
summarise_not_na <- flight %>%
  filter(!is.na(dep_time) & !is.na(sched_dep_time)) %>%
  group_by(carrier, dest) %>%
  summarise(
    mean_deptime = mean(dep_time),
    mean_sch_time = mean(sched_dep_time)
  )

head(summarise_not_na)

## # A tibble: 6 x 4
## # Groups:   carrier [1]
##   carrier  dest mean_deptime mean_sch_time
##     <chr> <chr>        <dbl>         <dbl>
## 1      9E   ATL     636.8421      636.5789
## 2      9E   AUS    1694.0000     1655.0000
## 3      9E   AVL     974.3000      976.9000
## 4      9E   BGR    2233.0000     2159.0000
## 5      9E   BNA    1588.4185     1562.4537
## 6      9E   BOS    1341.4384     1323.7977

*Count the number of distinct values (here: carriers per destination)

# Number of different carriers serving each destination.
distinct1 <- flight %>%
  group_by(dest) %>%
  summarise(disctin = n_distinct(carrier))

head(distinct1)

## # A tibble: 6 x 2
##    dest disctin
##   <chr>   <int>
## 1   ABQ       1
## 2   ACK       1
## 3   ALB       1
## 4   ANC       1
## 5   ATL       7
## 6   AUS       6

Counting the values below a threshold by summing a logical condition. You can do the same with mean, sd and max.

# Per carrier, count the flights whose dep_time is below 300. Summing the
# logical condition counts the TRUE values; missing departure times are
# ignored via na.rm = TRUE (spelled out, because the shorthand T is an
# ordinary variable that can be reassigned).
sum_1 <- flight %>%
  group_by(carrier) %>%
  summarise(sum1 = sum(dep_time < 300, na.rm = TRUE))

head(sum_1)

## # A tibble: 6 x 2
##   carrier  sum1
##     <chr> <int>
## 1      9E    17
## 2      AA    18
## 3      AS     0
## 4      B6   793
## 5      DL    59
## 6      EV   141

# Count the missing values of dep_delay over the whole table.
sum_missing <- summarise(flight, sum_missing = sum(is.na(dep_delay)))

# dep_delay is missing for 8255 observations.
head(sum_missing)

## # A tibble: 1 x 1
##   sum_missing
##         <int>
## 1        8255

*Ungroup when the grouping is no longer needed

# Group by carrier, add the air time expressed in hours, then remove the
# grouping so later steps operate on the whole table again.
# Air_Timehour is derived from air_time; dividing arr_time by 60 instead
# would scale an arrival clock reading, not a duration.
# NOTE(review): assumes air_time is recorded in minutes - confirm against the
# nycflights13 documentation.
df4 <- flight %>%
  group_by(carrier) %>%
  mutate(Air_Timehour = air_time / 60) %>%
  ungroup()

head(df4)

## # A tibble: 6 x 20
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, Air_Timehour <dbl>

*Select non-missing values

# Keep only the flights where both dest and dep_delay are present.
df_non_missing <- filter(flight, !is.na(dest) & !is.na(dep_delay))

head(df_non_missing)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Exploratory Data Analysis (EDA)

EDA is basically to look at two types of questions

What type of variation occurs within my variables?

What type of covariation occurs between my variables?

-Visualizing the categorical variables

# We can count the number of observations at each level of a categorical variable

# Rows per cut; count() is a shortcut for group_by() followed by summarise().
count(diamonds, cut)

## # A tibble: 5 x 2
##         cut     n
##       <ord> <int>
## 1      Fair  1610
## 2      Good  4906
## 3 Very Good 12082
## 4   Premium 13791
## 5     Ideal 21551

# The same counts as a bar chart with fixed outline and fill colours.
ggplot(diamonds, mapping = aes(x = cut)) +
  geom_bar(fill = 3, colour = 4)

*Interval for numeric variable

# Bin price into intervals of width 100 and count the diamonds in each bin.
d1 <- count(diamonds, cut_width(price, 100))

# Number of values that fall within each interval.
head(d1)

## # A tibble: 6 x 2
##   `cut_width(price, 100)`     n
##                    <fctr> <int>
## 1               [250,350]    17
## 2               (350,450]   876
## 3               (450,550]  1930
## 4               (550,650]  2717
## 5               (650,750]  2888
## 6               (750,850]  2696

# Restrict to diamonds below 2 carats.
carat_small <- filter(diamonds, carat < 2)

# Histogram of carat using bins of width 0.05.
ggplot(carat_small, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.05, colour = 3, fill = 4)

-Spotting unusual values

# Histogram of y, zoomed in on the count axis (0 to 50) so that rarely
# occupied bins become visible.
ggplot(diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.05, fill = 4) +
  coord_cartesian(ylim = c(0, 50))

# Zoom in further: counts from 0 to 10 and y values from 15 to 60 only.
ggplot(diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.05, fill = 4) +
  coord_cartesian(xlim = c(15, 60), ylim = c(0, 10))

# Keep the diamonds whose y lies between 0.01 and 20, retain only price and
# the three dimensions, and list the largest y first.
d_interval <- diamonds %>%
  filter(between(y, 0.01, 20)) %>%
  select(price, x, y, z) %>%
  arrange(desc(y))

head(d_interval)

## # A tibble: 6 x 4
##   price     x     y     z
##   <int> <dbl> <dbl> <dbl>
## 1 18018 10.74 10.54  6.98
## 2 18531 10.23 10.16  6.72
## 3 15223 10.14 10.10  6.17
## 4 15223 10.02  9.94  6.24
## 5 15984 10.01  9.94  6.31
## 6 17329 10.00  9.85  6.43

# Diamonds with an implausible y (below 2 or above 20), with price and the
# three dimensions moved to the front, smallest y first.
d_unusual <- diamonds %>%
  filter(y < 2 | y > 20) %>%
  select(price, x, y, z, everything()) %>%
  arrange(y)

# How could the length, width and height of a diamond be zero?
head(d_unusual)

## # A tibble: 6 x 10
##   price     x     y     z carat       cut color clarity depth table
##   <int> <dbl> <dbl> <dbl> <dbl>     <ord> <ord>   <ord> <dbl> <dbl>
## 1  5139     0     0     0  1.00 Very Good     H     VS2  63.3    53
## 2  6381     0     0     0  1.14      Fair     G     VS1  57.5    67
## 3 12800     0     0     0  1.56     Ideal     G     VS2  62.2    54
## 4 15686     0     0     0  1.20   Premium     D    VVS1  62.1    59
## 5 18034     0     0     0  2.25   Premium     H     SI2  62.8    59
## 6  2130     0     0     0  0.71      Good     F     SI2  64.1    60

*Replacing implausible values by NA

# New_y repeats y but holds NA wherever y is 0 or greater than 20; the rows
# are then sorted by the original y.
d_newvariable <- diamonds %>%
  mutate(New_y = ifelse(y == 0 | y > 20, NA, y)) %>%
  arrange(y)

head(d_newvariable)

## # A tibble: 6 x 11
##   carat       cut color clarity depth table price     x     y     z New_y
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1  1.00 Very Good     H     VS2  63.3    53  5139     0     0     0    NA
## 2  1.14      Fair     G     VS1  57.5    67  6381     0     0     0    NA
## 3  1.56     Ideal     G     VS2  62.2    54 12800     0     0     0    NA
## 4  1.20   Premium     D    VVS1  62.1    59 15686     0     0     0    NA
## 5  2.25   Premium     H     SI2  62.8    59 18034     0     0     0    NA
## 6  0.71      Good     F     SI2  64.1    60  2130     0     0     0    NA

*Boxplot

# Boxplots of hwy per class, with the classes ordered by their median hwy.
ggplot(mpg) +
  geom_boxplot(
    aes(
      x = reorder(class, hwy, FUN = median),
      y = hwy,
      fill = class
    )
  )

# Boxplots of price for carat bins of width 0.1: the group aesthetic makes one
# box per bin. The fill = cut mapping was removed, because cut is not constant
# within a carat bin and so cannot be shown as a single fill colour per box.
# NOTE(review): ggplot2 is expected to drop aesthetics that vary within a
# group when it computes the boxplot statistics - confirm for the installed
# version.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

*Visualizing the covariation between two categorical variables

# One point per cut/color combination, drawn with geom_count() in a fixed
# colour.
ggplot(diamonds, mapping = aes(x = cut, y = color)) +
  geom_count(colour = 4)

# Another way around

# Count the diamonds for every color/cut combination.
co <- count(diamonds, color, cut)

head(co)

## # A tibble: 6 x 3
##   color       cut     n
##   <ord>     <ord> <int>
## 1     D      Fair   163
## 2     D      Good   662
## 3     D Very Good  1513
## 4     D   Premium  1603
## 5     D     Ideal  2834
## 6     E      Fair   224

# Heat map of the counts: one tile per cut/color pair, filled by n.
ggplot(co, mapping = aes(x = cut, y = color, fill = n)) +
  geom_tile()

R for Data Science

Exploratory Data Analysis (EDA)