### Switch to the author's project folder when it exists on this machine.
### The path is machine-specific, so on any other machine the change is
### skipped (with a note) instead of stopping the script with an error.
if (dir.exists("C:/Users/malia/OneDrive/Desktop/MSDS R Bridge")) {
  setwd("C:/Users/malia/OneDrive/Desktop/MSDS R Bridge")
} else {
  message("Project directory not found; keeping working directory: ", getwd())
}
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(foreign)
library(descr)
library(expss)
##
## Use 'expss_output_viewer()' to display tables in the RStudio Viewer.
## To return to the console output, use 'expss_output_default()'.
##
## Attaching package: 'expss'
## The following objects are masked from 'package:magrittr':
##
## and, equals, or
## The following objects are masked from 'package:stringr':
##
## fixed, regex
## The following objects are masked from 'package:dplyr':
##
## between, compute, contains, first, last, na_if, recode, vars
## The following objects are masked from 'package:purrr':
##
## keep, modify, modify_if, transpose, when
## The following objects are masked from 'package:tidyr':
##
## contains, nest
## The following object is masked from 'package:ggplot2':
##
## vars
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 4.0.3
## Registered S3 methods overwritten by 'Hmisc':
## method from
## [.labelled expss
## print.labelled expss
## as.data.frame.labelled expss
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
### Download the CPS1985 CSV from the Rdatasets site into `dat`, then print
### the summary statistics of its wage column.
dat <- read.csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/AER/CPS1985.csv"
)
summary(dat[["wage"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.250 7.780 9.024 11.250 44.500
### Visual check of the wage distribution: a box plot followed by a histogram.
boxplot(dat$wage)
### The column name is spelled out in full (`dat$wag` only resolved through
### partial matching of `$`), and the title is passed as `main`; the original
### `emain` is not a recognised argument name, so the title was not applied.
histogram(
  dat$wage,
  main = "Histogram 1.1",
  xlab = "Wage exploration",
  col = "green"
)
### Data wrangling: derive a new variable, social class, from years of
### education. The break points 0, 11, 12 and 99 define three bands, labelled
### in order as dropout, high-school graduate and at least some college.
dat$socclass <- cut(
  dat$education,
  breaks = c(0, 11, 12, 99),
  labels = c(
    "high school dropout",
    "highschool graduate",
    "at least some college"
  )
)
### The correlation analysis shows a positive correlation between wage and education (r = 0.38, p < 0.001). A weak positive correlation between wage and experience (r = 0.09, p = 0.044) is also observed; it is significant at the 95% confidence level, not the 99% level.
### Correlation between experience and wage: pair the two columns in a data
### frame, pass them to rcorr() as a matrix, and show the resulting
### correlation and p-value tables.
correlate <- data.frame(
  dat.experience = dat$experience,
  dat.wage = dat$wage
)
correlate2 <- rcorr(as.matrix(correlate))
correlate2
## dat.experience dat.wage
## dat.experience 1.00 0.09
## dat.wage 0.09 1.00
##
## n= 534
##
##
## P
## dat.experience dat.wage
## dat.experience 0.0443
## dat.wage 0.0443
### Correlation between education and wage: pair the two columns in a data
### frame, pass them to rcorr() as a matrix, and show the resulting
### correlation and p-value tables.
correlate <- data.frame(
  dat.education = dat$education,
  dat.wage = dat$wage
)
correlate2 <- rcorr(as.matrix(correlate))
correlate2
## dat.education dat.wage
## dat.education 1.00 0.38
## dat.wage 0.38 1.00
##
## n= 534
##
##
## P
## dat.education dat.wage
## dat.education 0
## dat.wage 0
### Recode the categorical union variable with numeric-style labels for the
### analysis: 1 = "yes" (the participant belongs to a union), 2 = "no" (the
### participant does not belong to a union).
dat$union <- factor(dat$union, levels = c("yes", "no"), labels = c(1, 2))
### One-way ANOVA of wage on union membership, run to check whether
### participants who belong to a union earn differently from those who do
### not. The F-test indicates whether the group means differ; which group
### earns more has to be read from the group means themselves.
one.way <- aov(formula = wage ~ union, data = dat)
summary(one.way)
## Df Sum Sq Mean Sq F value Pr(>F)
## union 1 368 368.4 14.3 0.000174 ***
## Residuals 532 13708 25.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Mean wage per union group (group 1 = union member, group 2 = non-member),
### computed with aggregate(); na.rm is forwarded to mean().
aggregate(
  x = dat[["wage"]],
  by = list(dat[["union"]]),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 x
## 1 1 10.798125
## 2 2 8.635228
### Histogram of the recoded union factor (1 = member, 2 = non-member).
histogram(
  dat$union,
  main = "Histogram 1.2",
  xlab = "Trends for union members",
  col = "blue"
)