VISUALIZATION_GGPLOT2

library(ggplot2)
#library(datasets)
#data("diamonds")
library(readxl)
diamonds <- read_excel("C:/Users/USER/Documents/kaggle_data_set/Diamonds.xlsx")

explore data

attach(diamonds)
str(diamonds)

## tibble [219,704 × 27] (S3: tbl_df/tbl/data.frame)
##  $ diamond_id                 : num [1:219704] 1.31e+08 1.32e+08 1.32e+08 1.32e+08 1.32e+08 ...
##  $ shape                      : chr [1:219704] "Cushion" "Cushion" "Marquise" "Pear" ...
##  $ size                       : num [1:219704] 5.57 0.42 0.84 17.26 2.32 ...
##  $ color                      : chr [1:219704] "J" "K" "I" "K" ...
##  $ fancy_color_dominant_color : chr [1:219704] "Blue" "Pink" "Black" "Pink" ...
##  $ fancy_color_secondary_color: chr [1:219704] "Orange" "Orange" "Pink" "Purple" ...
##  $ fancy_color_overtone       : chr [1:219704] "Pinkish" "Yellowish" "Yellowish" "Orangey" ...
##  $ fancy_color_intensity      : chr [1:219704] "Fancy Dark" "Fancy Vivid" "Fancy Vivid" "Very Light" ...
##  $ clarity                    : chr [1:219704] "VVS1" "SI1" "SI3" "I2" ...
##  $ cut                        : chr [1:219704] "Poor" "Excellent" "Fair" "Very Good" ...
##  $ symmetry                   : chr [1:219704] "Very Good" "Very Good" "Very Good" "Good" ...
##  $ polish                     : chr [1:219704] "Fair" "Good" "Excellent" "Poor" ...
##  $ depth_percent              : num [1:219704] 8.4 9.5 1.2 8.9 9.4 0.1 9.4 6 7.3 2.4 ...
##  $ table_percent              : num [1:219704] 1.4 3.6 8.6 1.3 1.8 3.3 0.6 0.2 1.2 4.6 ...
##  $ meas_length                : num [1:219704] 8.14 3.88 5.11 4.64 3.63 6.73 4.49 7.33 3.76 8.76 ...
##  $ meas_width                 : num [1:219704] 11.33 13.56 12.12 16.45 9.14 ...
##  $ meas_depth                 : num [1:219704] 14.2 10.2 12 14.6 16.5 ...
##  $ girdle_min                 : chr [1:219704] "M" "VTN" "TN" "STN" ...
##  $ girdle_max                 : chr [1:219704] "VTK" "VTK" "STK" "TK" ...
##  $ culet_size                 : chr [1:219704] "SL" "N" "EL" "L" ...
##  $ culet_condition            : chr [1:219704] "Blue" "White" "Orange" "Green" ...
##  $ fluor_color                : chr [1:219704] "Blue" "Green" "White" "Yellow" ...
##  $ fluor_intensity            : chr [1:219704] "Faint" "Slight" "Very Strong" "Very Strong" ...
##  $ lab                        : chr [1:219704] "GIA" "HRD" "HRD" "GIA" ...
##  $ total_sales_price          : num [1:219704] 684153 1276836 1274951 480725 1273331 ...
##  $ eye_clean                  : chr [1:219704] "E1" "Yes" "No" "Yes" ...
##  $ date                       : POSIXct[1:219704], format: "2014-12-22" "2020-12-01" ...

head(diamonds)

subset

sub<-diamonds[seq(1,200000,by=10000),]
g1<-ggplot(sub,aes(meas_length,meas_width))+geom_point()+geom_smooth(method = "lm",sd=F)+facet_grid(~cut)+theme_minimal()+scale_color_brewer(palette = "Dark2")

## Warning in geom_smooth(method = "lm", sd = F): Ignoring unknown parameters:
## `sd`

g1

## `geom_smooth()` using formula = 'y ~ x'

## Warning in qt((1 - level)/2, df): NaNs produced

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

g2<-ggplot(sub,aes(meas_length))+geom_boxplot()+facet_wrap(~cut,nrow = 1)
g2

boxplot shows there are outliers remove them as below

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

sub<-diamonds[seq(1,15000,120),]
g3<-ggplot(sub,aes(meas_length,meas_width))+geom_point()+geom_smooth(method = "lm",sd=F)+facet_grid(~cut)+theme_minimal()+scale_color_brewer(palette = "Dark2")

## Warning in geom_smooth(method = "lm", sd = F): Ignoring unknown parameters:
## `sd`

g4<-ggplot(sub,aes(meas_length))+geom_boxplot()#+facet_wrap(~cut,nrow = 1)
g4

library(ggpubr)
ar<-ggarrange(g3,g4,nrow = 2,labels = "AUTO")

## `geom_smooth()` using formula = 'y ~ x'

ar

library(ggmice)
data(cars)
my.formula=y~poly(x,2)
#M<-ggplot(cars,aes(speed,dist))+geom_point()+geom_smooth(data=subset(cars,speed>15),formula = my.formula,method ="lm",se=T,level=0.9,colour="black")+geom_smooth(data=subset(cars,speed<15),formula = my.formula,method = "lm",se=T,level=0.9)+Stat_Poly_eq(data=subset(cars,speed>15,formula=my.formula,aes(label=paste(label=paste(..eq.label..,..rr.label..,sep = "~~~",parse=T,size=3,colour="black")))))

library(ggplot2)
library(palmerpenguins)
library(dplyr)

data exploration

str(penguins)

## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...

names(penguins)

## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"

summary(penguins)

##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2

data cleaning there are NA values which need to be removed as below

summary(penguins<-na.omit(penguins))

##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

The data now is clean from Na values complete cases

summary(penguins<-penguins[complete.cases(penguins),])

##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

remove duplicate cases

summary(penguins<-penguins%>%distinct())

##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

there were no duplicate cases. remove missing values

scatter_plot<-penguins%>%ggplot(aes(bill_length_mm,bill_depth_mm,color=species))+geom_point()+geom_smooth(method = "lm",se=F)+scale_color_brewer(palette = "Dark2")+labs(x="Bill Length(mm)",y="Bill Depth(mm)",color="species")+theme_minimal()
scatter_plot

## `geom_smooth()` using formula = 'y ~ x'

analysis of dataset called salaries

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(forcats)
data(Salaries)
str(Salaries)

## 'data.frame':    397 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AsstProf","AssocProf",..: 3 3 1 3 3 2 3 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yrs.since.phd: int  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : int  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ salary       : int  139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...

head(Salaries)

names(Salaries)

## [1] "rank"          "discipline"    "yrs.since.phd" "yrs.service"  
## [5] "sex"           "salary"

visualization of Salaries data scatter plot

plot<-Salaries%>%ggplot(aes(yrs.since.phd,salary))+geom_jitter(aes(color=rank,shape=discipline))+geom_smooth(method=lm,se=0)+facet_wrap(~sex)+labs(tittle= 'salary against years',x="salary",y="years")+theme_minimal()
plot

## `geom_smooth()` using formula = 'y ~ x'

s<-Salaries%>%filter(salary<150000)%>%ggplot(aes(x=rank,y=salary,fill=sex))+geom_boxplot(alpha=0.5)+scale_x_discrete(breaks=c("AsstProf","AssocProf","Prof"),labels=c("assistant\nprofessor","associate\nprofessor","full\nprofessor"))+scale_y_continuous(breaks=c(50000,100000,150000,200000),labels=c("$50k","$100k","$150","$200"))+theme(legend.position = c(.11,.78))
s

theme_set(theme_bw()+theme(title=element_text(color = "steelblue",face="bold")))
P<-Salaries%>%mutate(discipline=factor(discipline,levels=c("A","B"),labels=c("Agriculture","Biology")))%>%ggplot(aes(yrs.since.phd,salary))+geom_point(aes(color=rank,alpha=0.5))+geom_smooth()+facet_grid(sex~discipline)+labs(title="salary vs years",x="No of years",y="salary",color="position")
P

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

VISUALIZATION_GGPLOT2_1

mugo_muiruri_james

2023-09-18