This tutorial provides basic knowledge of R and its applications for data analysis. You will see how many useful and interesting things R can do with your data. I am still an R learner, so this tutorial is based on the work of Hadley Wickham, one of the smartest R programmers, and you can find his tutorials here
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Load the mpg data set and work on a copy called df
data("mpg")
df <- mpg
head(df, n = 6L)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p
## # ... with 1 more variables: class <chr>
*Simple Scatter Plot
# Scatter plot of highway fuel efficiency (hwy) against engine size (displ)
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(color = 4, size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  # Centre the title and make it bold
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))
*Adding another categorical variable to the two dimensional plot
# Colour each point by the categorical variable class
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))
# Display all points: jitter the positions so overlapping points become visible
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(
    mapping = aes(color = class),
    size = 2,
    shape = 16,
    position = "jitter"
  ) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))
*Using different types of color and shapes with categorical and continuous variables
library(viridis)
## Loading required package: viridisLite
# Colour the points by the continuous variable cty
# NOTE(review): viridis is attached just above, but no viridis scale is applied
# here -- confirm whether scale_color_viridis() was intended.
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = cty), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))
*Facet_wrap
# Store model as a factor (the plot below does not use this column)
df[["model"]] <- as.factor(df[["model"]])
# One panel per vehicle class, wrapped into a grid of panels
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold")) +
  facet_wrap(~class)
*Facet_grid
# Scatter plots for the different classes, laid out as columns (a single row)
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold")) +
  facet_grid(. ~ class)
# The same panels laid out as rows (a single column)
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold")) +
  facet_grid(class ~ .)
*Both row and column organization
# Rows are split by drv and columns by class; every panel shares the same axes
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class), size = 2, shape = 16) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter Plot"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold")) +
  facet_grid(drv ~ class, scales = "fixed")
*Using geom_smooth method
# Points and one smoothed curve per drive type: both layers inherit the same
# x, y and color mappings from ggplot()
ggplot(data = df, mapping = aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth() +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter plot "
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess'
# Points are coloured by drv, but the smoother has no color mapping, so a
# single curve is fitted through all points
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = drv)) +
  geom_smooth() +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter plot "
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess'
# Fit the smoother on the midsize cars only and drop the confidence band with
# se = FALSE (spelled out: the shorthand F is an ordinary, reassignable variable)
ggplot(data = df, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = drv)) +
  geom_smooth(data = filter(df, class == "midsize"), se = FALSE) +
  labs(
    x = "Car Engine Size",
    y = "Fuel Efficiency",
    title = "Scatter plot "
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess'
# Statistical Transformations
# The diamonds data set ships with ggplot2; show its first six rows
library(ggplot2)
head(diamonds, n = 6L)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
*Geom_bar
# Bar chart of the number of diamonds per cut, each bar filled by its cut
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = cut))
*Stat_count()
# The same counts drawn through stat_count(), with fixed outline and fill colours
ggplot(data = diamonds, mapping = aes(x = cut)) +
  stat_count(color = 4, fill = 3) +
  labs(
    x = "Cut",
    y = "Frequency/Count",
    title = "Bar Plot"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
*Geom_proportion
# Bar chart of the proportion of each cut type.
# after_stat(prop) replaces the deprecated ..prop.. notation; group = 1 puts
# all rows in a single group so the proportions are taken over the whole data
# set rather than within each bar.
ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, y = after_stat(prop), group = 1),
    color = 4,
    fill = 3
  )
*Stat_Summary
# For each cut, draw the mean depth with a range running from the minimum to
# the maximum depth. fun / fun.min / fun.max replace the deprecated
# fun.y / fun.ymin / fun.ymax arguments.
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth, color = cut),
    fun.min = min,
    fun.max = max,
    fun = mean
  ) +
  labs(
    x = "Cut Type",
    y = "Depth",
    title = "Errorbar Plot"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
*Geom_col
# geom_col() uses the y values as bar heights, here price for each cut
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_col(fill = 4)
*Position and adjustments of bar plot
# Stacked bar plot: clarity segments are piled on top of each other
ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar()
# Unstacked bar plot: clarity bars are placed side by side
ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar(position = "dodge")
# position = "fill", drawn with semi-transparent bars
ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill", alpha = 0.4)
*Boxplot
# Price distribution per cut, all boxes in one fixed fill colour
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(fill = 4)
# The same boxplot filled by cut and flipped so the boxes run horizontally
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot(mapping = aes(fill = cut)) +
  coord_flip()
*Some interesting plots
# Bar chart of cut counts drawn in polar coordinates
ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar() +
  coord_polar()
#Data Transformation
This section works with the nycflights13 dataset and uses the dplyr package for data manipulation.
library(dplyr)
library(nycflights13)
# Work on a copy of the flights table and report its number of rows and columns
flight <- flights
c(nrow(flight), ncol(flight))
## [1] 336776 19
head(flight, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
*Some dplyr verbs – filter() for selecting rows by values – select() for selecting columns by names – arrange() for reordering the rows – mutate() for creating a new variable from existing variables – summarise() for producing summary statistics
# Floating-point pitfall: exact comparison versus comparison with a tolerance
sqrt(2)^2 == 2 # Gives FALSE
## [1] FALSE
near(sqrt(2)^2, 2) # Gives TRUE, because the computer stores numbers as approximations
## [1] TRUE
*Find all flights departed in May and June
# Approach 1: combine two equality tests with |
df_month5 <- flight %>%
  filter(month == 5 | month == 6)
# Alternative: test membership in a set of months with %in%
df_month6 <- flight %>%
  filter(month %in% c(5, 6))
head(df_month6, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 5 1 9 1655 434 308
## 2 2013 5 1 451 500 -9 641
## 3 2013 5 1 537 540 -3 836
## 4 2013 5 1 544 545 -1 818
## 5 2013 5 1 548 600 -12 831
## 6 2013 5 1 549 600 -11 804
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Every flight that departed in neither May nor June
df_not56 <- flight %>%
  filter(month != 5 & month != 6)
head(df_not56, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
*Count missing values
# Count the flights whose departure delay is missing
sum(is.na(flight[["dep_delay"]]))
## [1] 8255
# Display the first rows whose departure delay is missing
flight %>%
  filter(is.na(dep_delay)) %>%
  head()
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
*Arrange
# Sort in increasing order by month, then by day within each month
flight_arrange <- arrange(flight, month, day)
head(flight_arrange, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Sort in decreasing order of departure time
flight_dec <- arrange(flight, desc(dep_time))
head(flight_dec, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 10 30 2400 2359 1 327
## 2 2013 11 27 2400 2359 1 515
## 3 2013 12 5 2400 2359 1 427
## 4 2013 12 9 2400 2359 1 432
## 5 2013 12 9 2400 2250 70 59
## 6 2013 12 13 2400 2359 1 432
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Bring the rows with a missing dep_time (then a missing dep_delay) to the top:
# is.na() gives TRUE/FALSE, and desc() sorts TRUE ahead of FALSE
flight_NA <- arrange(
  flight,
  desc(is.na(dep_time)),
  desc(is.na(dep_delay))
)
head(flight_NA, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
*Select
# Keep every variable except month and day
flight_select1 <- flight %>%
  select(-month, -day)
head(flight_select1, n = 6L)
## # A tibble: 6 x 17
## year dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <dbl> <int> <int>
## 1 2013 517 515 2 830 819
## 2 2013 533 529 4 850 830
## 3 2013 542 540 2 923 850
## 4 2013 544 545 -1 1004 1022
## 5 2013 554 600 -6 812 837
## 6 2013 554 558 -4 740 728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep all variables whose name contains `dep`
flight_dep <- select(flight, contains("dep"))
head(flight_dep, n = 6L)
## # A tibble: 6 x 3
## dep_time sched_dep_time dep_delay
## <int> <int> <dbl>
## 1 517 515 2
## 2 533 529 4
## 3 542 540 2
## 4 544 545 -1
## 5 554 600 -6
## 6 554 558 -4
# Move dep_time and dep_delay to the front; everything() keeps the remaining
# variables after them
flight_reorganize <- select(flight, dep_time, dep_delay, everything())
head(flight_reorganize, n = 6L)
## # A tibble: 6 x 19
## dep_time dep_delay year month day sched_dep_time arr_time
## <int> <dbl> <int> <int> <int> <int> <int>
## 1 517 2 2013 1 1 515 830
## 2 533 4 2013 1 1 529 850
## 3 542 2 2013 1 1 540 923
## 4 544 -1 2013 1 1 545 1004
## 5 554 -6 2013 1 1 600 812
## 6 554 -4 2013 1 1 558 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Keep the variables whose names start with "dep" or "arr"
# (dep_time, dep_delay, arr_time and arr_delay)
flight_s1 <- select(
  flight,
  starts_with("dep"),
  starts_with("arr")
)
head(flight_s1, n = 6L)
## # A tibble: 6 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
Select a certain number of variables from all variables
# Select columns listed in a character vector. all_of() replaces the superseded
# one_of(); it raises an error if a listed column does not exist.
flight_oneof <- flight %>%
  select(all_of(c("month", "day", "dep_delay", "arr_time")))
head(flight_oneof)
## # A tibble: 6 x 4
## month day dep_delay arr_time
## <int> <int> <dbl> <int>
## 1 1 1 2 830
## 2 1 1 4 850
## 3 1 1 2 923
## 4 1 1 -1 1004
## 5 1 1 -6 812
## 6 1 1 -4 740
*Mutate
# Add two derived columns: speed (distance / air_time * 60) and gain
# (arrival delay minus departure delay)
flight_new_column <- mutate(
  flight,
  speed = distance / air_time * 60,
  gain = arr_delay - dep_delay
)
head(flight_new_column, n = 6L)
## # A tibble: 6 x 21
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 14 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, speed <dbl>, gain <dbl>
Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().
# Rank every flight by departure delay, longest delay first (rank 1).
# min_rank() gives tied delays the same, smallest rank; missing delays get NA.
delay_flight <- min_rank(desc(flight$dep_delay))
head(delay_flight)
## [1] 114150 103893 114150 144947 258934 209494
# The rank vector alone does not identify the flights, so keep the rows ranked
# 1 to 10. A tie at rank 10 would return more than 10 rows, and rows with a
# missing delay are dropped because their rank is NA.
top10_delayed <- flight %>%
  filter(min_rank(desc(dep_delay)) <= 10) %>%
  arrange(desc(dep_delay))
top10_delayed
*Summarise
# Per-day summary: mean departure delay, number of flights and mean arrival
# delay. na.rm = TRUE (spelled out rather than the reassignable shorthand T)
# skips missing delays when averaging.
# NOTE(review): this variable shares its name with dplyr::summarise_all();
# the name is kept so existing references still work -- consider renaming.
summarise_all <- flight %>%
  group_by(month, day) %>%
  summarise(
    delay = mean(dep_delay, na.rm = TRUE),
    count = n(),
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )
head(summarise_all)
## # A tibble: 6 x 5
## # Groups: month [1]
## month day delay count arr_delay
## <int> <int> <dbl> <int> <dbl>
## 1 1 1 11.548926 842 12.651023
## 2 1 2 13.858824 943 12.692888
## 3 1 3 10.987832 914 5.733333
## 4 1 4 8.951595 915 -1.932819
## 5 1 5 5.732218 720 -1.525802
## 6 1 6 7.148014 832 4.236429
# Drop rows with a missing dep_time or sched_dep_time, then average both
# columns for each carrier/destination pair
# NOTE(review): values such as 517 and 2400 suggest these columns hold clock
# times (HHMM), so an arithmetic mean may not be a true mean time -- confirm.
flight_complete <- filter(flight, !is.na(dep_time), !is.na(sched_dep_time))
summarise_not_na <- flight_complete %>%
  group_by(carrier, dest) %>%
  summarise(
    mean_deptime = mean(dep_time),
    mean_sch_time = mean(sched_dep_time)
  )
head(summarise_not_na, n = 6L)
## # A tibble: 6 x 4
## # Groups: carrier [1]
## carrier dest mean_deptime mean_sch_time
## <chr> <chr> <dbl> <dbl>
## 1 9E ATL 636.8421 636.5789
## 2 9E AUS 1694.0000 1655.0000
## 3 9E AVL 974.3000 976.9000
## 4 9E BGR 2233.0000 2159.0000
## 5 9E BNA 1588.4185 1562.4537
## 6 9E BOS 1341.4384 1323.7977
*Calculate distinct number of factors
# Number of distinct carriers serving each destination
distinct1 <- flight %>%
  group_by(dest) %>%
  summarise(disctin = n_distinct(carrier))
head(distinct1, n = 6L)
## # A tibble: 6 x 2
## dest disctin
## <chr> <int>
## 1 ABQ 1
## 2 ACK 1
## 3 ALB 1
## 4 ANC 1
## 5 ATL 7
## 6 AUS 6
Counting the values below a threshold by summing a logical condition. You can do the same with mean, sd and max.
# For each carrier, count the flights with dep_time below 300: summing a
# logical vector counts its TRUE values. na.rm = TRUE (spelled out rather than
# the reassignable shorthand T) ignores flights with a missing dep_time.
sum_1 <- flight %>%
  group_by(carrier) %>%
  summarise(sum1 = sum(dep_time < 300, na.rm = TRUE))
head(sum_1)
## # A tibble: 6 x 2
## carrier sum1
## <chr> <int>
## 1 9E 17
## 2 AA 18
## 3 AS 0
## 4 B6 793
## 5 DL 59
## 6 EV 141
# Count the missing dep_delay values with summarise()
sum_missing <- summarise(flight, sum_missing = sum(is.na(dep_delay)))
# dep_delay has 8255 missing values (missing observations)
head(sum_missing, n = 6L)
## # A tibble: 1 x 1
## sum_missing
## <int>
## 1 8255
*Ungroup if no keeping required
# Add the flight duration in hours, then drop the grouping so that later steps
# operate on the whole table again.
# Air_Timehour is derived from air_time, the column the speed calculation in
# this file divides distance by and scales by 60. The previous version divided
# arr_time, the arrival clock time, which is not a duration.
df4 <- flight %>%
  group_by(carrier) %>%
  mutate(Air_Timehour = air_time / 60) %>%
  ungroup()
head(df4)
## # A tibble: 6 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, Air_Timehour <dbl>
*Select non-missing values
# Keep only the flights where both dest and dep_delay are present
df_non_missing <- filter(
  flight,
  !is.na(dest),
  !is.na(dep_delay)
)
head(df_non_missing, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Exploratory data analysis (EDA) basically looks at two types of questions:
What type of variation occurs within my variables?
What type of covariation occurs between my variables?
-Visualizing the categorical variables
# Count the number of observations at each level of a categorical variable;
# count() is an alternative to group_by() + summarise()
count(diamonds, cut)
## # A tibble: 5 x 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# Bar chart of the same per-cut counts
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(color = 4, fill = 3)
*Interval for numeric variable
# Split price into intervals of width 100 and count the diamonds in each one
d1 <- count(diamonds, cut_width(price, 100))
head(d1, n = 6L) # The number of values falling within each interval
## # A tibble: 6 x 2
## `cut_width(price, 100)` n
## <fctr> <int>
## 1 [250,350] 17
## 2 (350,450] 876
## 3 (450,550] 1930
## 4 (550,650] 2717
## 5 (650,750] 2888
## 6 (750,850] 2696
# Keep the diamonds below 2 carats and plot their carat distribution
carat_small <- filter(diamonds, carat < 2)
ggplot(data = carat_small, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.05, fill = 4, color = 3)
-Spotting unusual values
# Zoom the y axis to counts of 0-50 so that rare values of y become visible
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.05, fill = 4) +
  coord_cartesian(ylim = c(0, 50))
# Zoom in further: counts of 0-10, and only y values between 15 and 60
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.05, fill = 4) +
  coord_cartesian(ylim = c(0, 10), xlim = c(15, 60))
# Keep the rows with y inside [0.01, 20], keep price and the three dimension
# columns, and sort by y in decreasing order
d_interval <- diamonds %>%
  filter(y >= 0.01 & y <= 20) %>%
  select(price, x, y, z) %>%
  arrange(desc(y))
head(d_interval, n = 6L)
## # A tibble: 6 x 4
## price x y z
## <int> <dbl> <dbl> <dbl>
## 1 18018 10.74 10.54 6.98
## 2 18531 10.23 10.16 6.72
## 3 15223 10.14 10.10 6.17
## 4 15223 10.02 9.94 6.24
## 5 15984 10.01 9.94 6.31
## 6 17329 10.00 9.85 6.43
# Rows with an unusually small (< 2) or large (> 20) y, with price and the
# dimension columns moved to the front, sorted by increasing y
d_unusual <- diamonds %>%
  filter(y < 2 | y > 20) %>%
  select(price, x, y, z, everything()) %>%
  arrange(y)
head(d_unusual, n = 6L) # How could the length, width and height of a diamond be zero?
## # A tibble: 6 x 10
## price x y z carat cut color clarity depth table
## <int> <dbl> <dbl> <dbl> <dbl> <ord> <ord> <ord> <dbl> <dbl>
## 1 5139 0 0 0 1.00 Very Good H VS2 63.3 53
## 2 6381 0 0 0 1.14 Fair G VS1 57.5 67
## 3 12800 0 0 0 1.56 Ideal G VS2 62.2 54
## 4 15686 0 0 0 1.20 Premium D VVS1 62.1 59
## 5 18034 0 0 0 2.25 Premium H SI2 62.8 59
## 6 2130 0 0 0 0.71 Good F SI2 64.1 60
*Replacing implausible values by NA
# New_y copies y, but records NA where y is 0 or greater than 20
d_newvariable <- diamonds %>%
  mutate(
    New_y = ifelse(y == 0 | y > 20, NA, y)
  ) %>%
  arrange(y)
head(d_newvariable, n = 6L)
## # A tibble: 6 x 11
## carat cut color clarity depth table price x y z New_y
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1.00 Very Good H VS2 63.3 53 5139 0 0 0 NA
## 2 1.14 Fair G VS1 57.5 67 6381 0 0 0 NA
## 3 1.56 Ideal G VS2 62.2 54 12800 0 0 0 NA
## 4 1.20 Premium D VVS1 62.1 59 15686 0 0 0 NA
## 5 2.25 Premium H SI2 62.8 59 18034 0 0 0 NA
## 6 0.71 Good F SI2 64.1 60 2130 0 0 0 NA
*Boxplot
# Boxplots of hwy per class, with the classes ordered by their median hwy
ggplot(data = mpg) +
  geom_boxplot(
    mapping = aes(
      x = reorder(class, hwy, FUN = median),
      y = hwy,
      fill = class
    )
  )
# Price boxplots with one box per 0.1-wide carat interval
# NOTE(review): fill = cut can vary within a carat interval while the grouping
# is fixed per interval -- confirm this fill mapping has the intended effect.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(
    mapping = aes(
      group = cut_width(carat, 0.1),
      fill = cut
    )
  )
*Visualizing the covariation between two categorical variables
# geom_count() counts the observations at each cut/color combination and draws
# one point per combination
ggplot(data = diamonds, mapping = aes(x = cut, y = color)) +
  geom_count(color = 4)
# Another way: compute the count of each color/cut combination with dplyr
co <- count(diamonds, color, cut)
head(co, n = 6L)
## # A tibble: 6 x 3
## color cut n
## <ord> <ord> <int>
## 1 D Fair 163
## 2 D Good 662
## 3 D Very Good 1513
## 4 D Premium 1603
## 5 D Ideal 2834
## 6 E Fair 224
# Draw the precomputed counts as tiles, using the count n as the fill colour
ggplot(data = co, mapping = aes(x = cut, y = color, fill = n)) +
  geom_tile()