This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print per-column summary statistics (min, quartiles, median, mean, max)
# for the cars data set
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
First, we’ll read the data from the provided Excel file.
# Read the Excel file
# Install readxl only when it is missing. requireNamespace() checks for the
# package without attaching it, so library() below is the single place where
# the package is attached, and a failed install surfaces there as an error.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl", repos = "https://cloud.r-project.org/")
}
# Load the library
library(readxl)

# NOTE(review): absolute, machine-specific path; the document only knits on a
# machine where this file exists. Fail early with a clear message otherwise.
wage_path <- "/Users/taivan.jargalan/Documents/wage_det.xlsx"
if (!file.exists(wage_path)) {
  stop("Wage data file not found: ", wage_path, call. = FALSE)
}
data <- read_excel(wage_path)
# Basic information on the numerical columns: one row per statistic
# (minimum, maximum, mean) and one column per variable.
three_stats <- function(values) {
  c(min(values), max(values), mean(values))
}

Stat <- c("Minimum", "Maximum", "Mean")
wage <- three_stats(data$wage)
education <- three_stats(data$education)
experience <- three_stats(data$experience)

# data.frame() takes its column names from the variable names above
basic_info <- data.frame(Stat, wage, education, experience)
basic_info
## Stat wage education experience
## 1 Minimum 50.0500 0.00000 -4.00000
## 2 Maximum 18777.2000 18.00000 63.00000
## 3 Mean 603.7268 13.06787 18.19993
# Basic information on the qualitative columns

# Head counts for the two ethnicity labels and the "cauc" share among them
white_count <- sum(data[["ethnicity"]] == "cauc")
black_count <- sum(data[["ethnicity"]] == "afam")
percent_white <- white_count / (white_count + black_count)

# Head counts by part-time status and the part-time share
parttime <- sum(data[["parttime"]] == "yes")
fulltime <- sum(data[["parttime"]] == "no")
percent_parttime <- parttime / (parttime + fulltime)
percent_parttime
## [1] 0.0896466

# Distinct region labels, in order of first appearance
region_types <- unique(data[["region"]])
region_types
## [1] "northeast" "midwest" "south" "west"
# Regression of wage on education, experience, ethnicity, region and
# part-time status.

# Dummy coding: ethnicity2 is 0 for "afam" and 1 for everything else;
# parttime2 is 1 for "yes" and 0 for everything else.
data$ethnicity2 <- as.numeric(!(data$ethnicity %in% "afam"))
data$parttime2 <- as.numeric(data$parttime %in% "yes")

# region2 keeps the numeric codes 1-4 (0 for any value not listed) so the
# column itself is unchanged for any other code that reads it.
region_levels <- c("northeast", "midwest", "south", "west")
data$region2 <- as.numeric(match(data$region, region_levels, nomatch = 0L))

# Region is a nominal category, so it enters the model as a factor (one
# coefficient per region relative to the lowest code present) instead of a
# single slope on the arbitrary 1-4 codes. Passing `data =` rather than
# `data$...` terms gives readable coefficient names and lets predict() accept
# new data.
# NOTE(review): the summary printed below this chunk was produced with the
# earlier numeric region coding; re-knit the document to refresh it.
regression <- lm(
  wage ~ education + experience + ethnicity2 + factor(region2) + parttime2,
  data = data
)
summary(regression)
##
## Call:
## lm(formula = data$wage ~ data$education + data$experience + data$ethnicity2 +
## data$region2 + data$parttime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1078.5 -209.5 -50.4 136.1 18111.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -403.3769 16.0933 -25.07 <2e-16 ***
## data$education 58.5198 0.8564 68.33 <2e-16 ***
## data$experience 9.7425 0.1901 51.26 <2e-16 ***
## data$ethnicity2 120.9292 8.7923 13.75 <2e-16 ***
## data$region2 -5.7102 2.2221 -2.57 0.0102 *
## data$parttime2 -356.0552 8.3353 -42.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 397.3 on 28149 degrees of freedom
## Multiple R-squared: 0.2327, Adjusted R-squared: 0.2326
## F-statistic: 1708 on 5 and 28149 DF, p-value: < 2.2e-16
# Graph: wage against education, with a fitted least-squares line on top
plot(
  x = data$education,
  y = data$wage,
  main = "Wage vs Education",
  xlab = "Education", ylab = "Wage",
  pch = 19, col = "black"
)

# Simple regression of wage on education, drawn as a blue line
trend <- lm(data$wage ~ data$education)
abline(reg = trend, col = "blue")
# Histograms of wage by ethnicity, followed by the group means
black_wage <- data$wage[data$ethnicity == "afam"]
white_wage <- data$wage[data$ethnicity == "cauc"]

# probability = TRUE draws densities (bar areas sum to 1), so the y axis is
# labelled "Density" rather than "Frequency". Both plots share the same
# x range so they can be compared side by side.
hist(black_wage,
  col = "blue", probability = TRUE, xlim = c(0, 4000),
  main = "Wage for African Americans",
  xlab = "Wage", ylab = "Density",
  border = "black", breaks = 20
)
hist(white_wage,
  col = "red", probability = TRUE, xlim = c(0, 4000),
  main = "Wage for Caucasians",
  xlab = "Wage", ylab = "Density",
  border = "black", breaks = 100
)
mean(black_wage)
## [1] 446.8526
mean(white_wage)
# Histograms of wage by part-time / full-time status, followed by the group
# means
parttime_wage <- data$wage[data$parttime == "yes"]
fulltime_wage <- data$wage[data$parttime == "no"]

# Plot the part-time and full-time wage vectors defined above (the earlier
# version plotted the per-ethnicity vectors under these titles).
# probability = TRUE draws densities, so the y axis is labelled "Density".
hist(parttime_wage,
  col = "yellow", probability = TRUE, xlim = c(0, 4000),
  main = "Wage for Part Time",
  xlab = "Wage", ylab = "Density",
  border = "black", breaks = 20
)
hist(fulltime_wage,
  col = "green", probability = TRUE, xlim = c(0, 4000),
  main = "Wage for Fulltime",
  xlab = "Wage", ylab = "Density",
  border = "black", breaks = 100
)
mean(parttime_wage)
## [1] 233.7263
mean(fulltime_wage)
## [1] 640.1625