R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print per-column summary statistics for the built-in `cars` data set.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

1. Load the Data

First, we’ll read the data from the provided Excel file.

# Make sure readxl is available, installing it from CRAN on first use.
# requireNamespace() only checks that the package can be loaded; unlike
# require() it does not attach it, so the single library() call below is what
# loads readxl -- and library() stops with an error if the install failed.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl", repos = "https://cloud.r-project.org/")
}
library(readxl)

# Read the wage data from the Excel workbook into `data`.
# NOTE(review): absolute, user-specific path -- the document only knits on a
# machine that has the workbook at exactly this location.
data <- read_excel("/Users/taivan.jargalan/Documents/wage_det.xlsx")

# Descriptive statistics (minimum, maximum, mean) for the numeric variables.
Stat <- c("Minimum", "Maximum", "Mean")

# Returns c(min, max, mean) of x, in the same order as the labels in `Stat`.
min_max_mean <- function(x) {
  c(min(x), max(x), mean(x))
}

wage <- min_max_mean(data$wage)
education <- min_max_mean(data$education)
experience <- min_max_mean(data$experience)

# One row per statistic, one column per variable.
basic_info <- data.frame(Stat, wage, education, experience)
basic_info
##      Stat       wage education experience
## 1 Minimum    50.0500   0.00000   -4.00000
## 2 Maximum 18777.2000  18.00000   63.00000
## 3    Mean   603.7268  13.06787   18.19993
# Composition of the categorical variables.
# Note: the `percent_*` values are proportions on a 0-1 scale, not 0-100.

# Ethnicity: share of "cauc" among the rows coded "cauc" or "afam".
is_white <- data$ethnicity == "cauc"
is_black <- data$ethnicity == "afam"
white_count <- sum(is_white)
black_count <- sum(is_black)
percent_white <- white_count / (white_count + black_count)

# Employment type: share of part-time among the rows coded "yes" or "no".
is_parttime <- data$parttime == "yes"
is_fulltime <- data$parttime == "no"
parttime <- sum(is_parttime)
fulltime <- sum(is_fulltime)
percent_parttime <- parttime / (parttime + fulltime)
percent_parttime
## [1] 0.0896466
# Distinct region labels, in order of first appearance in the data.
region_types <- unique(data[["region"]])
region_types
## [1] "northeast" "midwest"   "south"     "west"
# Regression of wage on education, experience, ethnicity, region and
# part-time status.
#
# NOTE: the coefficient table printed after this chunk must be regenerated
# (re-knit the document). It was produced by the earlier specification, which
# entered region as one numeric 1-4 term and used `data$` inside the formula.

# Ethnicity dummy: 0 for "afam", 1 for every other row.
data$ethnicity2 <- 1
data$ethnicity2[data$ethnicity == "afam"] <- 0

# Numeric region code: 1 = northeast, 2 = midwest, 3 = south, 4 = west,
# 0 = none of these. The column is kept for backward compatibility but is no
# longer used as a regressor (see below).
region_levels <- c("northeast", "midwest", "south", "west")
data$region2 <- as.numeric(match(data$region, region_levels, nomatch = 0))

# Region is nominal. Entering the 1-4 code as a number would force a single
# "region slope", i.e. pretend the regions are ordered and equally spaced.
# As a factor, each region gets its own shift relative to the first level
# ("northeast").
data$region_factor <- factor(data$region, levels = region_levels)

# Part-time dummy: 1 for "yes", 0 otherwise.
data$parttime2 <- 0
data$parttime2[data$parttime == "yes"] <- 1

# Bare column names plus `data =` keep the coefficient names readable and let
# predict() work with `newdata`; `data$x` terms in a formula do neither.
regression <- lm(
  wage ~ education + experience + ethnicity2 + region_factor + parttime2,
  data = data
)
summary(regression)
## 
## Call:
## lm(formula = data$wage ~ data$education + data$experience + data$ethnicity2 + 
##     data$region2 + data$parttime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1078.5  -209.5   -50.4   136.1 18111.2 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -403.3769    16.0933  -25.07   <2e-16 ***
## data$education    58.5198     0.8564   68.33   <2e-16 ***
## data$experience    9.7425     0.1901   51.26   <2e-16 ***
## data$ethnicity2  120.9292     8.7923   13.75   <2e-16 ***
## data$region2      -5.7102     2.2221   -2.57   0.0102 *  
## data$parttime2  -356.0552     8.3353  -42.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 397.3 on 28149 degrees of freedom
## Multiple R-squared:  0.2327, Adjusted R-squared:  0.2326 
## F-statistic:  1708 on 5 and 28149 DF,  p-value: < 2.2e-16
# Scatter plot of wage against education with a fitted least-squares line.

# Simple linear fit of wage on education, used only to draw the trend line.
trend <- lm(data$wage ~ data$education)

plot(
  x = data$education,
  y = data$wage,
  pch = 19,
  col = "black",
  xlab = "Education",
  ylab = "Wage",
  main = "Wage vs Education"
)

# Overlay the fitted line on the scatter plot.
abline(trend, col = "blue")

# Wage distribution by ethnicity: one density histogram per group, followed by
# the group means.
black_wage <- data$wage[data$ethnicity == "afam"]
white_wage <- data$wage[data$ethnicity == "cauc"]

# probability = TRUE puts densities (not counts) on the y axis, so the axis is
# labelled "Density". The x axis is clipped to 0-4000; wages above that are
# still binned but fall outside the visible range.
hist(black_wage, col = "blue", probability = TRUE, xlim = c(0, 4000),
     main = "Wage for African Americans",
     xlab = "Wage", ylab = "Density",
     border = "black", breaks = 20)

hist(white_wage, col = "red", probability = TRUE, xlim = c(0, 4000),
     main = "Wage for Caucasians",
     xlab = "Wage", ylab = "Density",
     border = "black", breaks = 100)

mean(black_wage)
## [1] 446.8526
mean(white_wage)
## [1] 617.2339
# Wage distribution by employment type: one density histogram per group,
# followed by the group means.
parttime_wage <- data$wage[data$parttime == "yes"]
fulltime_wage <- data$wage[data$parttime == "no"]

# Plot the part-time / full-time vectors built above. The earlier version
# passed black_wage / white_wage here, so these two charts repeated the
# ethnicity histograms under part-time / full-time titles.
# probability = TRUE puts densities (not counts) on the y axis, so the axis is
# labelled "Density". The x axis is clipped to 0-4000.
hist(parttime_wage, col = "yellow", probability = TRUE, xlim = c(0, 4000),
     main = "Wage for Part Time",
     xlab = "Wage", ylab = "Density",
     border = "black", breaks = 20)

hist(fulltime_wage, col = "green", probability = TRUE, xlim = c(0, 4000),
     main = "Wage for Fulltime",
     xlab = "Wage", ylab = "Density",
     border = "black", breaks = 100)

mean(parttime_wage)
## [1] 233.7263
mean(fulltime_wage)
## [1] 640.1625