library(ggplot2)
#library(datasets)
#data("diamonds")
library(readxl)
diamonds <- read_excel("C:/Users/USER/Documents/kaggle_data_set/Diamonds.xlsx")
explore data
attach(diamonds)
str(diamonds)
## tibble [219,704 × 27] (S3: tbl_df/tbl/data.frame)
## $ diamond_id : num [1:219704] 1.31e+08 1.32e+08 1.32e+08 1.32e+08 1.32e+08 ...
## $ shape : chr [1:219704] "Cushion" "Cushion" "Marquise" "Pear" ...
## $ size : num [1:219704] 5.57 0.42 0.84 17.26 2.32 ...
## $ color : chr [1:219704] "J" "K" "I" "K" ...
## $ fancy_color_dominant_color : chr [1:219704] "Blue" "Pink" "Black" "Pink" ...
## $ fancy_color_secondary_color: chr [1:219704] "Orange" "Orange" "Pink" "Purple" ...
## $ fancy_color_overtone : chr [1:219704] "Pinkish" "Yellowish" "Yellowish" "Orangey" ...
## $ fancy_color_intensity : chr [1:219704] "Fancy Dark" "Fancy Vivid" "Fancy Vivid" "Very Light" ...
## $ clarity : chr [1:219704] "VVS1" "SI1" "SI3" "I2" ...
## $ cut : chr [1:219704] "Poor" "Excellent" "Fair" "Very Good" ...
## $ symmetry : chr [1:219704] "Very Good" "Very Good" "Very Good" "Good" ...
## $ polish : chr [1:219704] "Fair" "Good" "Excellent" "Poor" ...
## $ depth_percent : num [1:219704] 8.4 9.5 1.2 8.9 9.4 0.1 9.4 6 7.3 2.4 ...
## $ table_percent : num [1:219704] 1.4 3.6 8.6 1.3 1.8 3.3 0.6 0.2 1.2 4.6 ...
## $ meas_length : num [1:219704] 8.14 3.88 5.11 4.64 3.63 6.73 4.49 7.33 3.76 8.76 ...
## $ meas_width : num [1:219704] 11.33 13.56 12.12 16.45 9.14 ...
## $ meas_depth : num [1:219704] 14.2 10.2 12 14.6 16.5 ...
## $ girdle_min : chr [1:219704] "M" "VTN" "TN" "STN" ...
## $ girdle_max : chr [1:219704] "VTK" "VTK" "STK" "TK" ...
## $ culet_size : chr [1:219704] "SL" "N" "EL" "L" ...
## $ culet_condition : chr [1:219704] "Blue" "White" "Orange" "Green" ...
## $ fluor_color : chr [1:219704] "Blue" "Green" "White" "Yellow" ...
## $ fluor_intensity : chr [1:219704] "Faint" "Slight" "Very Strong" "Very Strong" ...
## $ lab : chr [1:219704] "GIA" "HRD" "HRD" "GIA" ...
## $ total_sales_price : num [1:219704] 684153 1276836 1274951 480725 1273331 ...
## $ eye_clean : chr [1:219704] "E1" "Yes" "No" "Yes" ...
## $ date : POSIXct[1:219704], format: "2014-12-22" "2020-12-01" ...
head(diamonds)
subset
sub<-diamonds[seq(1,200000,by=10000),]
g1<-ggplot(sub,aes(meas_length,meas_width))+geom_point()+geom_smooth(method = "lm",sd=F)+facet_grid(~cut)+theme_minimal()+scale_color_brewer(palette = "Dark2")
## Warning in geom_smooth(method = "lm", sd = F): Ignoring unknown parameters:
## `sd`
g1
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
g2<-ggplot(sub,aes(meas_length))+geom_boxplot()+facet_wrap(~cut,nrow = 1)
g2
boxplot shows there are outliers remove them as below
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sub<-diamonds[seq(1,15000,120),]
g3<-ggplot(sub,aes(meas_length,meas_width))+geom_point()+geom_smooth(method = "lm",sd=F)+facet_grid(~cut)+theme_minimal()+scale_color_brewer(palette = "Dark2")
## Warning in geom_smooth(method = "lm", sd = F): Ignoring unknown parameters:
## `sd`
g4<-ggplot(sub,aes(meas_length))+geom_boxplot()#+facet_wrap(~cut,nrow = 1)
g4
library(ggpubr)
ar<-ggarrange(g3,g4,nrow = 2,labels = "AUTO")
## `geom_smooth()` using formula = 'y ~ x'
ar
library(ggmice)
data(cars)
my.formula=y~poly(x,2)
#M<-ggplot(cars,aes(speed,dist))+geom_point()+geom_smooth(data=subset(cars,speed>15),formula = my.formula,method ="lm",se=T,level=0.9,colour="black")+geom_smooth(data=subset(cars,speed<15),formula = my.formula,method = "lm",se=T,level=0.9)+Stat_Poly_eq(data=subset(cars,speed>15,formula=my.formula,aes(label=paste(label=paste(..eq.label..,..rr.label..,sep = "~~~",parse=T,size=3,colour="black")))))
library(ggplot2)
library(palmerpenguins)
library(dplyr)
data exploration
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
names(penguins)
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "year"
summary(penguins)
## species island bill_length_mm bill_depth_mm
## Adelie :152 Biscoe :168 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :124 1st Qu.:39.23 1st Qu.:15.60
## Gentoo :124 Torgersen: 52 Median :44.45 Median :17.30
## Mean :43.92 Mean :17.15
## 3rd Qu.:48.50 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## NA's :2 NA's :2
## flipper_length_mm body_mass_g sex year
## Min. :172.0 Min. :2700 female:165 Min. :2007
## 1st Qu.:190.0 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197.0 Median :4050 NA's : 11 Median :2008
## Mean :200.9 Mean :4202 Mean :2008
## 3rd Qu.:213.0 3rd Qu.:4750 3rd Qu.:2009
## Max. :231.0 Max. :6300 Max. :2009
## NA's :2 NA's :2
data cleaning there are NA values which need to be removed as below
summary(penguins<-na.omit(penguins))
## species island bill_length_mm bill_depth_mm
## Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60
## Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.60 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172 Min. :2700 female:165 Min. :2007
## 1st Qu.:190 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197 Median :4050 Median :2008
## Mean :201 Mean :4207 Mean :2008
## 3rd Qu.:213 3rd Qu.:4775 3rd Qu.:2009
## Max. :231 Max. :6300 Max. :2009
The data now is clean from Na values complete cases
summary(penguins<-penguins[complete.cases(penguins),])
## species island bill_length_mm bill_depth_mm
## Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60
## Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.60 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172 Min. :2700 female:165 Min. :2007
## 1st Qu.:190 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197 Median :4050 Median :2008
## Mean :201 Mean :4207 Mean :2008
## 3rd Qu.:213 3rd Qu.:4775 3rd Qu.:2009
## Max. :231 Max. :6300 Max. :2009
remove duplicate cases
summary(penguins<-penguins%>%distinct())
## species island bill_length_mm bill_depth_mm
## Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60
## Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.60 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172 Min. :2700 female:165 Min. :2007
## 1st Qu.:190 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197 Median :4050 Median :2008
## Mean :201 Mean :4207 Mean :2008
## 3rd Qu.:213 3rd Qu.:4775 3rd Qu.:2009
## Max. :231 Max. :6300 Max. :2009
there were no duplicate cases. remove missing values
scatter_plot<-penguins%>%ggplot(aes(bill_length_mm,bill_depth_mm,color=species))+geom_point()+geom_smooth(method = "lm",se=F)+scale_color_brewer(palette = "Dark2")+labs(x="Bill Length(mm)",y="Bill Depth(mm)",color="species")+theme_minimal()
scatter_plot
## `geom_smooth()` using formula = 'y ~ x'
analysis of dataset called salaries
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forcats)
data(Salaries)
str(Salaries)
## 'data.frame': 397 obs. of 6 variables:
## $ rank : Factor w/ 3 levels "AsstProf","AssocProf",..: 3 3 1 3 3 2 3 3 3 3 ...
## $ discipline : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
## $ yrs.since.phd: int 19 20 4 45 40 6 30 45 21 18 ...
## $ yrs.service : int 18 16 3 39 41 6 23 45 20 18 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
## $ salary : int 139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...
head(Salaries)
names(Salaries)
## [1] "rank" "discipline" "yrs.since.phd" "yrs.service"
## [5] "sex" "salary"
visualization of Salaries data scatter plot
plot<-Salaries%>%ggplot(aes(yrs.since.phd,salary))+geom_jitter(aes(color=rank,shape=discipline))+geom_smooth(method=lm,se=0)+facet_wrap(~sex)+labs(tittle= 'salary against years',x="salary",y="years")+theme_minimal()
plot
## `geom_smooth()` using formula = 'y ~ x'
s<-Salaries%>%filter(salary<150000)%>%ggplot(aes(x=rank,y=salary,fill=sex))+geom_boxplot(alpha=0.5)+scale_x_discrete(breaks=c("AsstProf","AssocProf","Prof"),labels=c("assistant\nprofessor","associate\nprofessor","full\nprofessor"))+scale_y_continuous(breaks=c(50000,100000,150000,200000),labels=c("$50k","$100k","$150","$200"))+theme(legend.position = c(.11,.78))
s
theme_set(theme_bw()+theme(title=element_text(color = "steelblue",face="bold")))
P<-Salaries%>%mutate(discipline=factor(discipline,levels=c("A","B"),labels=c("Agriculture","Biology")))%>%ggplot(aes(yrs.since.phd,salary))+geom_point(aes(color=rank,alpha=0.5))+geom_smooth()+facet_grid(sex~discipline)+labs(title="salary vs years",x="No of years",y="salary",color="position")
P
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'