Data_Analysis_and_Decision_Making

require(faraway)

## Loading required package: faraway

require(ggplot2)

## Loading required package: ggplot2

require(GGally)

## Loading required package: GGally

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:faraway':
## 
##     happy

require(gridExtra)

## Loading required package: gridExtra

require(e1071)

## Loading required package: e1071

head(uswages,10)

##         wage educ exper race smsa ne mw so we pt
## 6085  771.60   18    18    0    1  1  0  0  0  0
## 23701 617.28   15    20    0    1  0  0  0  1  0
## 16208 957.83   16     9    0    1  0  0  1  0  0
## 2720  617.28   12    24    0    1  1  0  0  0  0
## 9723  902.18   14    12    0    1  0  1  0  0  0
## 22239 299.15   12    33    0    1  0  0  0  1  0
## 14379 541.31   16    42    0    1  0  0  1  0  1
## 12878 148.39   16     0    0    1  0  1  0  0  1
## 23121 273.19   12    36    0    1  0  0  0  1  1
## 13086 666.67   12    37    0    0  0  1  0  0  0

summary(uswages)

##       wage              educ           exper            race      
##  Min.   :  50.39   Min.   : 0.00   Min.   :-2.00   Min.   :0.000  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   1st Qu.:0.000  
##  Median : 522.32   Median :12.00   Median :15.00   Median :0.000  
##  Mean   : 608.12   Mean   :13.11   Mean   :18.41   Mean   :0.078  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00   3rd Qu.:0.000  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00   Max.   :1.000  
##       smsa             ne              mw               so        
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.756   Mean   :0.229   Mean   :0.2485   Mean   :0.3125  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##        we             pt        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.21   Mean   :0.0925  
##  3rd Qu.:0.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000

#We see that exper has negative values.
uswages$exper[uswages$exper < 0] = NA
summary(uswages$exper)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    8.00   16.00   18.74   27.00   59.00      33

#Convert categorical variables to factors
uswages$race = factor(uswages$race)
levels(uswages$race) = c("White", "Black")

uswages$smsa = factor(uswages$smsa)
levels(uswages$smsa) = c("No", "Yes")

uswages$pt = factor(uswages$pt)
levels(uswages$pt) = c("No", "Yes")


#Convert set of dummy variables to one variable
uswages = data.frame(uswages, region = 1*uswages$ne + 2*uswages$mw + 3*uswages$so + 4*uswages$we)
uswages$region = factor(uswages$region)
levels(uswages$region) = c("ne", "mw", "so", "we")


#Deleting four regions ne, mw, so and we 
uswages = subset(uswages, select = -c(ne:we))


#Take care of NA's
uswages = na.omit(uswages)

#5 - Number Summary
summary(uswages)

##       wage              educ           exper          race       smsa     
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00   White:1812   No : 483  
##  1st Qu.: 314.69   1st Qu.:12.00   1st Qu.: 8.00   Black: 155   Yes:1484  
##  Median : 522.32   Median :12.00   Median :16.00                          
##  Mean   : 613.99   Mean   :13.08   Mean   :18.74                          
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00                          
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00                          
##    pt       region  
##  No :1802   ne:448  
##  Yes: 165   mw:488  
##             so:616  
##             we:415  
##                     
##

#Analysis:
#If Mean and Median are unequal, skewness is present.
#Skewed - Wage
#Not Skwed - Educ and Exper
#In Race, smsa, pt and region there is no concept of skewness because it has binary values.
#There is an unbalanced counts for race, smsa and pt.
#This would tend to weaken the strength of a factor to predict the wages.


#Correlation 
cor(uswages$wage, uswages$educ)

## [1] 0.2616368

#There is a weak positive correlation between wage and educ. 
cor(uswages$wage, uswages$exper)

## [1] 0.1694355

#There is a weak positive correlation between wage and exper. 
cor(uswages$educ,uswages$exper)

## [1] -0.2934846

#There is a weak negative correlation between educ and exper. 


#Distribution of wages
m = mean(uswages$wage, na.rm = TRUE)
std = sd(uswages$wage, na.rm = TRUE)
n = length(uswages$wage)
p = 1:n/(n+1)
oldpar = par(mfrow = c(2,2))
hist(
  uswages$wage, 
  density = 20, 
  breaks = 20,
  freq = FALSE,
  prob=TRUE, 
  xlab = "uswages$wage",
  main = "Normal curve over histogram")
curve(
  dnorm(x, mean = m, sd = std),
  col = "darkblue", 
  lwd = 0.25, 
  add=TRUE)
plot(
  density(uswages$wage),
  main = "Normal curve overlay")
curve(
  dnorm(x, mean = m, sd = std), 
  col = "darkblue", 
  lwd = 0.25, 
  add = TRUE)
plot(
  p, 
  sort(uswages$wage), 
  pch = ".", 
  cex = 2,
  main = "Sort plot w/ normal curve overlay")
curve(
  qnorm(x, mean = m, sd = std), 
  col = "darkblue", 
  lwd = 0.25, 
  add = TRUE)
qqnorm(
  uswages$wage, 
  pch = ".", 
  cex = 2, 
  main = "Normal Probability QQ Plot")
qqline(uswages$wage)

#Analysis:
#There are outliers present.
#From the density graph, it is clear that wage is positively skewed.
#Majority of the wage lies between 50 to 2000.


#Boxplots
plot(wage ~ pt, data = uswages)
#Analysis:
#The wages for part time are not that spread out compared to the people who are working full time.
#People doing full time have more wages compared to people working part time.

plot(wage ~ region, data = uswages)
#Analysis:
#The max wage value & the interquartile range for the people living in we(West) is slightly more compared to the rest of the regions.
#There are outliers present in all the regions except for mw(Middle West).

plot(wage ~ smsa, data = uswages)
#Analysis:
#The max wage value & the interquartile range for the people living in smsa(Standard Metropolitan Statistical Area) is slightly more compared to the people not living in smsa.
#There are outliers present in boxplot for the people living in smsa(Standard Metropolitan Statistical Area)

plot(wage ~ race, data = uswages)

#Analysis:
#Whites earn more compared to blacks.
#There are outliers present in whites.


#Experience vs Wage with respect to Race
ggplot(uswages,aes(x = exper, y = wage, shape = race, na.rm ='TRUE')) +geom_point() +facet_grid(~ race)

#Analysis:
#We observe that compared to blacks there are more no of whites. Apart from that what we observe that the wages of whites is also higher compared to black.


#Education vs Wage with respect to Race
ggplot(uswages, aes(educ, wage, shape = race, na.rm = 'TRUE')) +geom_point() +facet_grid(~race)

#Analysis:
#The distribution of education in whites is spread out compared to blacks and whites receive more wages.


#Part time vs Wage with respect to Race
ggplot(uswages,aes(x=pt,y= wage, shape= race, na.rm='TRUE'))+geom_point()+facet_grid(~ race)

#Analysis:
#Whites who dont work in Part Time are more in numbers compared to blacks and earn more wages.
#This statement holds good even for part time


#Education vs Wage with respect to Part time
ggplot(uswages, aes(educ, wage)) +facet_grid(~pt) +geom_point() +geom_jitter()

#Analysis:
#People who are not working as part time are more and their wages are more spread out compared to people who are working as part time.
#People with no part time are distributed from 0 to 18 years of education. 


#Experience vs Wage with respect to Part time
ggplot(uswages, aes(exper, wage)) +geom_point() +facet_grid(~pt)

#Analysis:
#People who are not working as part time are more and their wages are more spread out compared to people who are working as part time.
#People with no part time are distributed from 0 to 50 years of experience. 


#Scatter plots for all the attributes
ggpairs(uswages, columns = c(1:3))

#Analysis:
#Correlation between wage and educ is 0.26 which shows a weak positive linear relationship. 
#Correlation between wage and exper is 0.16 which shows a weak positive linear relationship.
#Correlation between educ adn exper is -0.29 which shows a weak negative linear relationsip.

pairs(~ wage + educ + exper, data = uswages, col = uswages$race)

ggpairs(uswages, mapping = aes(colour = race))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Analysis 1: Skewness
skewness(uswages$wage)

## [1] 3.833774

skewness(uswages$educ)

## [1] -0.6824895

skewness(uswages$exper)

## [1] 0.6671678

#For wages, the graph is highly distributed and the data is positively skewed i.e towards the right.
#For educ, the graph is moderately distributed and the data is negatively skewed i.e towards the left.
#For exper, the graph is moderately distributed and the data is positively skewed i.e towards the right.

#Analysis 2: Wages vs Factor variables
#wage vs race - Whites earn more than the black do and the spread of wages for whites is more.
#wage vs smsa - The count and the wages of whites in smsa is very high compared the blacks. The count of whites not living in smsa is high compared to blacks but the wages are almost similar.
#wage vs pt - The count and the wages of whites who work part-time is very high compared the blacks.
#wages vs region - The wages for all the people living in different regions is almost the same.

#Analysis 3: Educ vs Factor variables
#educ vs race - Whites are more educated than blacks.
#educ vs smsa - Whites staying in smsa are more educated than blacks. Whites not staying in smsa are more educated than blacks.
#educ vs pt - There are more number of blacks who are educated and are working as part time compared to whites. Whites not working as part time are more educated than blacks.
#educ vs region - In all the region Whites are more educated than blacks

#Analysis 4: Exper vs Factor variables
#exper vs race - There are almost equal number of whites and blacks who have same years of experience.
#exper vs smsa - Whites living in smsa and not living in smsa an are slightly more experienced than blacks.
#exper vs pt - Whites working as part time have much more experience than blacks. Whites and blacks have almost the same experience for those who are not working as part time.
#exper vs region - In all the region Whites are more experienced than blacks apart from the ones staying in Middle West where the whites and  blacks have almost the same experience.


ggpairs(uswages, lower = list(continuous = "smooth", combo = "facetdensity", mapping = aes(color = race)))

#Analysis:
#The slope coefficient is based on two predictors - educ and exper.


#Fit Model
fit = lm(wage ~ educ + exper, uswages)
summary(fit)

## 
## Call:
## lm(formula = wage ~ educ + exper, data = uswages)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1014.7  -235.2   -52.1   150.1  7249.2 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -239.1146    50.7111  -4.715 2.58e-06 ***
## educ          51.8654     3.3423  15.518  < 2e-16 ***
## exper          9.3287     0.7602  12.271  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 426.8 on 1964 degrees of freedom
## Multiple R-squared:  0.1348, Adjusted R-squared:  0.1339 
## F-statistic:   153 on 2 and 1964 DF,  p-value: < 2.2e-16

#Analysis:
#The slope coefficient are positive. So with increase in education and experience there will be increase in wages.

deviance = deviance(fit)
deviance

## [1] 357763806

y = uswages$wage
totalss = sum((y-mean(y))^2)
totalss

## [1] 413500833

1 - deviance/totalss

## [1] 0.134793

summary(fit)$r.square

## [1] 0.134793

#Analysis:
#The model is not a good fit because the value of R^2 is 0.13 which is much less than 1.
#13% of the variance in the response variable can be explained by the explanatory variables. The remaining 87% can be attributed to unknown, lurking variables or inherent variability

c(summary(fit)$r.square, cor(fitted.values(fit), uswages$wage)^2)

## [1] 0.134793 0.134793

#Analysis:
#The pearson correlation is equal to model summary.


#Fit Plot
qplot(fit$fitted.value, wage, data = uswages) +geom_abline(intercept = 0, slope = 1, color="green") +ggtitle("Fit Plot")

#Analysis:
#It is not a good fit plot

#Residual Plot
ggplot(fit, aes(.fitted, .resid)) +geom_point() +geom_hline(yintercept = 0, color = "red", linetype = "dashed") + ggtitle("Residual Plot")

#Analysis:
#The points in a residual plot are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data.


#Exploring model structure
cor(fit$resid, uswages$wage)

## [1] 0.930165

#Analysis:
#As the correlation is high, it indicates that there is some trouble with our model.

plot1 = qplot(race, fit$resid, geom = "boxplot", data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
plot2 = qplot(smsa, fit$resid, geom = "boxplot", data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
plot3 = qplot(pt, fit$resid, geom = "boxplot", data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
plot4 = qplot(region, fit$resid, geom = "boxplot", data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
plot5 = qplot(educ, fit$resid, data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
plot6 = qplot(exper, fit$resid, data = uswages) +geom_hline(yintercept = 0, color = "dark blue", linetype = "dashed")
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, nrow = 3)

#Analysis:
#We see prounanced patters indicating we do not need to include square of the predictors or other transforms of the predictors.


#Normailty of the Residual
mod = fortify(fit)
plot7 = qplot(.stdresid, data = mod, geom = "histogram")
plot8 = qplot(.stdresid, data = mod, geom = "density")
plot9 = qplot(sample = .stdresid, data = mod, geom = "qq") +geom_abline()
grid.arrange(plot7, plot8, plot9, nrow = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Analysis:
#We see that the residual do not look as though they come from a normal distribution.

Data_Analysis_and_Decision_Making_8.R

aksha

Mon Oct 09 03:12:24 2017