library(ggplot2)
#library(datasets)
#data("diamonds")
library(readxl)
# Load the diamonds data set from a local Excel workbook.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine where this script is run.
diamonds <- read_excel(
  path = "C:/Users/USER/Documents/kaggle_data_set/Diamonds.xlsx"
)

Explore the data

# Inspect the structure of the diamonds data: column names, types and the
# first few values of each column.
# attach() was dropped: every later call in this script passes its data frame
# explicitly, and attaching would place columns such as `cut` and `date` on
# the search path, where a stale copy can silently shadow other objects.
str(diamonds)
## tibble [219,704 × 27] (S3: tbl_df/tbl/data.frame)
##  $ diamond_id                 : num [1:219704] 1.31e+08 1.32e+08 1.32e+08 1.32e+08 1.32e+08 ...
##  $ shape                      : chr [1:219704] "Cushion" "Cushion" "Marquise" "Pear" ...
##  $ size                       : num [1:219704] 5.57 0.42 0.84 17.26 2.32 ...
##  $ color                      : chr [1:219704] "J" "K" "I" "K" ...
##  $ fancy_color_dominant_color : chr [1:219704] "Blue" "Pink" "Black" "Pink" ...
##  $ fancy_color_secondary_color: chr [1:219704] "Orange" "Orange" "Pink" "Purple" ...
##  $ fancy_color_overtone       : chr [1:219704] "Pinkish" "Yellowish" "Yellowish" "Orangey" ...
##  $ fancy_color_intensity      : chr [1:219704] "Fancy Dark" "Fancy Vivid" "Fancy Vivid" "Very Light" ...
##  $ clarity                    : chr [1:219704] "VVS1" "SI1" "SI3" "I2" ...
##  $ cut                        : chr [1:219704] "Poor" "Excellent" "Fair" "Very Good" ...
##  $ symmetry                   : chr [1:219704] "Very Good" "Very Good" "Very Good" "Good" ...
##  $ polish                     : chr [1:219704] "Fair" "Good" "Excellent" "Poor" ...
##  $ depth_percent              : num [1:219704] 8.4 9.5 1.2 8.9 9.4 0.1 9.4 6 7.3 2.4 ...
##  $ table_percent              : num [1:219704] 1.4 3.6 8.6 1.3 1.8 3.3 0.6 0.2 1.2 4.6 ...
##  $ meas_length                : num [1:219704] 8.14 3.88 5.11 4.64 3.63 6.73 4.49 7.33 3.76 8.76 ...
##  $ meas_width                 : num [1:219704] 11.33 13.56 12.12 16.45 9.14 ...
##  $ meas_depth                 : num [1:219704] 14.2 10.2 12 14.6 16.5 ...
##  $ girdle_min                 : chr [1:219704] "M" "VTN" "TN" "STN" ...
##  $ girdle_max                 : chr [1:219704] "VTK" "VTK" "STK" "TK" ...
##  $ culet_size                 : chr [1:219704] "SL" "N" "EL" "L" ...
##  $ culet_condition            : chr [1:219704] "Blue" "White" "Orange" "Green" ...
##  $ fluor_color                : chr [1:219704] "Blue" "Green" "White" "Yellow" ...
##  $ fluor_intensity            : chr [1:219704] "Faint" "Slight" "Very Strong" "Very Strong" ...
##  $ lab                        : chr [1:219704] "GIA" "HRD" "HRD" "GIA" ...
##  $ total_sales_price          : num [1:219704] 684153 1276836 1274951 480725 1273331 ...
##  $ eye_clean                  : chr [1:219704] "E1" "Yes" "No" "Yes" ...
##  $ date                       : POSIXct[1:219704], format: "2014-12-22" "2020-12-01" ...
# Print the first rows of the diamonds data.
head(diamonds)

Take a subset of the data

# Thin the data to 20 rows: every 10,000th row from row 1 up to row 190,001.
sub <- diamonds[seq(1, 200000, by = 10000), ]

# Scatter plot of measured width against measured length, one facet per cut,
# with a linear fit drawn in each facet.
# The confidence band is switched off with `se = FALSE`; the previous
# `sd = F` is not a geom_smooth() parameter and was ignored with a warning.
# scale_color_brewer() is kept as written; no colour aesthetic is mapped in
# this plot, so it has nothing to act on.
g1 <- ggplot(sub, aes(meas_length, meas_width)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(~cut) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2")
g1
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

# Box plot of measured length (mapped to x), one panel per cut, with all
# panels laid out in a single row.
g2 <- ggplot(data = sub, mapping = aes(x = meas_length)) +
  geom_boxplot() +
  facet_wrap(~cut, nrow = 1)
g2

The boxplot shows that there are outliers; remove them as below.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Re-sample: every 120th row among the first 15,000 rows (125 rows).
# NOTE(review): this only changes which rows are sampled; it does not filter
# rows by value, so confirm it is the intended way to deal with outliers.
sub <- diamonds[seq(1, 15000, by = 120), ]

# Scatter plot of measured width against measured length, one facet per cut,
# with a linear fit drawn in each facet.
# The confidence band is switched off with `se = FALSE`; the previous
# `sd = F` is not a geom_smooth() parameter and was ignored with a warning.
# scale_color_brewer() is kept as written; no colour aesthetic is mapped in
# this plot, so it has nothing to act on.
g3 <- ggplot(sub, aes(meas_length, meas_width)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(~cut) +
  theme_minimal() +
  scale_color_brewer(palette = "Dark2")

# Box plot of measured length over the whole sample; faceting by cut is left
# disabled, as in the original.
g4 <- ggplot(sub, aes(meas_length)) +
  geom_boxplot() # + facet_wrap(~cut, nrow = 1)
g4

library(ggpubr)
# Stack the scatter plot (g3) above the box plot (g4) in a two-row layout;
# labels = "AUTO" lets ggarrange() generate the panel labels.
ar <- ggarrange(
  g3, g4,
  labels = "AUTO",
  nrow = 2
)
## `geom_smooth()` using formula = 'y ~ x'
ar

library(ggmice)
# Load the `cars` data set into the workspace.
data("cars")

# Second-degree polynomial model formula, written in generic x/y terms.
my.formula <- y ~ poly(x, 2)

# Disabled draft of a plot on `cars`, kept verbatim below.
# NOTE(review): the draft calls `Stat_Poly_eq`; presumably
# `ggpmisc::stat_poly_eq` was intended, and ggpmisc is not loaded in this
# script -- confirm before re-enabling.
#M<-ggplot(cars,aes(speed,dist))+geom_point()+geom_smooth(data=subset(cars,speed>15),formula = my.formula,method ="lm",se=T,level=0.9,colour="black")+geom_smooth(data=subset(cars,speed<15),formula = my.formula,method = "lm",se=T,level=0.9)+Stat_Poly_eq(data=subset(cars,speed>15,formula=my.formula,aes(label=paste(label=paste(..eq.label..,..rr.label..,sep = "~~~",parse=T,size=3,colour="black")))))
library(ggplot2)
library(palmerpenguins)
library(dplyr)

Data exploration

# Show the structure of the penguins data: column types and first values.
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# List the column names of the penguins data.
names(penguins)
## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"
# Per-column summary; the output also reports how many NA values each
# column holds.
summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2

Data cleaning: there are NA values, which need to be removed as below.

# Drop every row that contains at least one NA, overwrite `penguins` with the
# result, then summarise the cleaned data.
penguins <- na.omit(penguins)
summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

The data is now free of NA values. Keep only complete cases:

# Keep only the rows with no missing value in any column, overwrite
# `penguins` with the result, then summarise it.
penguins <- penguins[complete.cases(penguins), ]
summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

Remove duplicate cases

# Drop exact duplicate rows, overwrite `penguins` with the result, then
# summarise it.
penguins <- distinct(penguins)
summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

There were no duplicate cases. Remove missing values.

# Scatter plot of bill depth against bill length, coloured by species, with
# one linear fit per species and no confidence band.
scatter_plot <- ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, color = species)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    x = "Bill Length(mm)",
    y = "Bill Depth(mm)",
    color = "species"
  ) +
  theme_minimal()
scatter_plot
## `geom_smooth()` using formula = 'y ~ x'