library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1 Instructions

  • This is an individual assignment. Submit your .Rmd and “knitted”.html files through Collab.

  • Upload your html file on RPubs and include the link when you submit your submission files on Collab.

  • Please don’t use ggplot2 for this assignment. We’ll use ggplot2 almost all the time after this assignment.

2 Part 1

  • Use the occupational experience variable (“oexp”) of the income_example dataset and plot

    • a histogram,
    • a kernel density estimate,
    • a boxplot of “oexp”,
    • a set of boxplots showing the distribution of “oexp” by sex crossed with occupational status (“occ”).
  • You can either produce four separate but small plots, or you can use par(mfrow = c(2, 2)) to create a plotting region consisting of four subplots.

  • Briefly describe the distributions of occupational experience in words (also include your plots and the R syntax).

[“Play” with the hist() and density() functions; for instance, by choosing a different number of bins or different break points for the hist() function, or different bandwidths using the adjust argument in density(). See also the corresponding help files and the examples given there. Only include the histogram and density estimate you find most informative. Also, add useful axis labels and a title using the following arguments inside the plotting functions: xlab, ylab, main. See the help page ?par for descriptions of many more plotting parameters.]

[That’s what the plots could look like – but you have to do it with your group ;-)]

#Loading data

# Read the income example data from the working directory and preview its
# first rows.
incomeTab <- read.table("income_exmpl.dat")
head(incomeTab)
##   sex age  edu  occ oexp income
## 1   f  62  low  low   35    953
## 2   m  32 high high    6   1224
## 3   m  56 med. high   36   1466
## 4   f  63 med. med.   38   1339
## 5   m  20  low  low    3   1184
## 6   f  38 med. med.   12   1196
# Put the categorical columns into a meaningful order: low < med. < high for
# occupational status and education, and male before female for sex.
incomeTab$occ <- factor(incomeTab$occ, levels = c("low", "med.", "high"))
incomeTab$edu <- factor(incomeTab$edu, levels = c("low", "med.", "high"))
incomeTab$sex <- factor(
  incomeTab$sex,
  levels = c("m", "f"),
  labels = c("male", "female")
)

# Take the standalone copies only after re-levelling. Copies made from the raw
# columns would be grouped alphabetically (f, m and high, low, med.), which
# does not line up with hand-written male/female x low/med/high plot labels.
occ <- incomeTab$occ
sex <- incomeTab$sex

# Histogram Plots

This is a histogram with bins of 5 years of experience. I believe this distribution demonstrates that between 5 and 35 years of experience there is little decrease in frequency. This stagnation suggests that most people work for about 35 years, after which a significant portion start to retire. Conversely, the first 5 years show an influx of people.

# Pull out the occupational-experience column and draw its frequency
# histogram with hist()'s default binning.
oexp <- incomeTab[["oexp"]]
hist(
  oexp,
  xlab = "Occ. Experience",
  main = "Histogram of Occ. Experience"
)

A distribution with density does show the same data as above, but density is a better metric than frequency in this situation.

# Same histogram on the density scale (bar areas sum to 1). `freq = FALSE` is
# spelled out instead of relying on partial matching of `prob` and on `T`.
hist(
  oexp,
  main = "Histogram of Occ. Experience Density",
  xlab = "Occ. Experience",
  freq = FALSE
)

# Kernel Density Estimate

This density estimate outlines the curve of occupational experience, which can show the lifecycle of employees’ careers. It also shows how evenly experience is spread across the workforce.

# Estimate the density of occupational experience with density()'s default
# settings, then draw the resulting curve.
oexp_density <- density(incomeTab[["oexp"]])
plot(
  oexp_density,
  xlab = "Occ. Experience",
  main = "Density Estimate Distribution of Occ. Experience"
)

# Box plot

This box plot shows that the middle 50 percent of the work force has between 8 and 30 years of experience; the remaining 50 percent is split evenly between 0–8 years and 30+ years.

# Minimum, quartiles and maximum of occupational experience.
quantile(oexp, probs = c(0, 0.25, 0.5, 0.75, 1))
##   0%  25%  50%  75% 100% 
##    0    8   19   30   48
# Cut points at every 20th percentile.
quantile(oexp, probs = seq(0, 1, by = 0.2))
##   0%  20%  40%  60%  80% 100% 
##    0    6   15   23   32   48
# Single horizontal boxplot of occupational experience.
boxplot(
  oexp,
  horizontal = TRUE,
  main = "Boxplot of Occ. Experience",
  xlab = "Occ. Experience (in years)"
)

# A set of box plots

The sets of box plots show that male occupational experience doesn’t change much across occupational status, while the female plots show a large dip for the female med. group. In addition, the female low and high groups both have higher occupational experience than the corresponding male box plots. The low female med. value could represent women staying at home to take care of their children.

# Boxplots of occupational experience for every occupational-status x sex
# cell. The variables are looked up in incomeTab (data = ...), so the grouping
# follows its re-levelled factors: occ runs low, med., high within male and
# then within female. That is the order the hand-written `names` and the
# blue/red colours assume; grouping on alphabetically ordered copies would put
# the boxes under the wrong labels.
boxplot(
  oexp ~ occ + sex,
  data = incomeTab,
  col = rep(c("blue", "red"), each = 3),
  ylab = "Occ. Experience",
  xlab = "",
  horizontal = FALSE,
  names = c(
    "male\nlow", "male\nmed", "male\nhigh",
    "female\nlow", "female\nmed", "female\nhigh"
  ),
  main = "Occ. Experience by Sex and Occ. Status"
)

  • describe your plots.

3 Part 2

  • Download the SCS Data set from Collab (there you also find a separate file containing a brief description of variables). Then investigate the relationship between the mathematics achievement score (“mathpre”) and the math anxiety score (“mars”) by plotting the data and the path of means.
  1. Produce a scatterplot between “mathpre” and “mars”. You might consider using jitter() or alpha() for avoiding overlying points.
library(haven)
# Import the SCS data file from the working directory and show the table.
scs <- haven::read_sav(file = "SCS_QE.sav")
scs
## # A tibble: 210 × 31
##    vocabpre mathpre numbmath likemath likelit preflit pextra pagree pconsc pemot
##       <dbl>   <dbl>    <dbl>    <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl> <dbl>
##  1       24       7        2        6       7       2     16     39     35    29
##  2       26       3        2        2      10       3     22     41     35    29
##  3       17       5        1        3       8       3     31     39     39    29
##  4       23       4        2        8      10       2     22     46     34    33
##  5       23       5        2        2       7       3     29     48     48    40
##  6       28       7        2        2       9       3     28     43     34    36
##  7       12       5        4        6       6       1     31     32     36    33
##  8       25      10        3        4       3       2     23     36     32    15
##  9       17       9        1        7       2       1     20     50     40    30
## 10       23       8        2        4       7       3     34     42     33    31
## # … with 200 more rows, and 21 more variables: pintell <dbl>, mars <dbl>,
## #   beck <dbl>, rq <dbl+lbl>, vm <dbl+lbl>, cauc <dbl+lbl>, afram <dbl+lbl>,
## #   other <dbl>, age <dbl>, male <dbl+lbl>, married <dbl+lbl>, parents <dbl>,
## #   momdegr <dbl>, daddegr <dbl>, credit <dbl>, majormi <dbl+lbl>,
## #   actcomp <dbl>, hsgpaar <dbl>, collgpaa <dbl>, vocaball <dbl>, mathall <dbl>
# Standalone copies of the two columns used in the scatterplots:
# the math pretest score and the math anxiety score.
mathpre <- scs[["mathpre"]]
mars <- scs[["mars"]]

Scatter Plot

# Plain scatterplot of math anxiety against the math pretest score, drawn
# with small filled points.
plot(
  x = mathpre,
  y = mars,
  pch = 16,
  cex = 0.4,
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale"
)

Scatter Plot using jitter

# Same scatterplot with a small amount of random noise added to both
# variables so that coinciding points become visible.
# The title previously read "Fitter"; it names the jitter() technique.
plot(
  jitter(mathpre),
  jitter(mars),
  main = "Scatter Plot using Jitter",
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale",
  cex = 0.4,
  pch = 16
)

Scatter Plot with a jitter factor of 3

# Scatterplot with stronger jitter (factor = 3) on both axes.
plot(
  jitter(mathpre, factor = 3),
  jitter(mars, factor = 3),
  pch = 16,
  cex = 0.4,
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale",
  main = "Scatter Plot with a Jitter Factor of 3"
)

  1. Draw a conditioning plot for female and male students (variable “male”). Include “| male” in your first argument to create a conditioning plot.
# Conditioning plot of the math pretest against math anxiety, one panel per
# category of `male`.
# `male` is read as a labelled numeric column (<dbl+lbl>). coplot() slices a
# numeric conditioning variable into overlapping intervals, so it is converted
# to a factor first to get exactly one panel per category.
gender <- haven::as_factor(scs$male)
# NOTE(review): coplot() has no `main` parameter of its own; the title is
# forwarded to the panel function through `...` -- confirm it is actually drawn.
coplot(
  mathpre ~ mars | gender,
  data = scs,
  cex = 0.6,
  xlab = "Mathematics Anxiety Rating Scale",
  ylab = "Math Pretest",
  main = "Math Pretest by Mathematics Anxiety Rating Scale separated by Gender(M/F)"
)

  1. Describe in words the relation between math scores and math anxiety. Do you find evidence of Simpson’s Paradox?

I don’t believe that Simpson’s Paradox is present, because the scatter plots for the male and female data don’t show any difference in trend. As the Math Pretest score increases, the Mathematics Anxiety Rating Scale score decreases. This trend is the same for males and females, except that the female data is more spread out between the two extremes.

4 Part 3

  • Use a dataset that is available in data repositories (e.g., kaggle)

  • Briefly describe the dataset you’re using (e.g., means to access data, context, sample, variables, etc…)

    • describe your data.
  • Re-do Part 2, i.e.,

    • produce a scatterplot between “A” and “B”. You might consider using jitter() or alpha() for avoiding overlying points.
    • draw a scatterplot conditioning on variable “C”. Include “| C” in your first argument to create a conditioning plot.
    • describe in words the relation between “A” and “B.” Do you find evidence of Simpson’s Paradox?
# Read the Netflix Originals data and keep only the titles whose language is
# exactly "English" or "Hindi" (multi-language entries such as
# "English/Japanese" are dropped by the exact match).
netflix <- read.csv("NetflixOriginals.csv")
target <- c("English", "Hindi")
# Namespace-qualified because dplyr's filter() masks stats::filter().
netflixLang <- dplyr::filter(netflix, Language %in% target)
# Preview of the unfiltered table.
head(netflix)
##             Title                 Genre          Premiere Runtime IMDB.Score
## 1 Enter the Anime           Documentary    August 5, 2019      58        2.5
## 2     Dark Forces              Thriller   August 21, 2020      81        2.6
## 3         The App Science fiction/Drama December 26, 2019      79        2.6
## 4  The Open House       Horror thriller  January 19, 2018      94        3.2
## 5     Kaali Khuhi               Mystery  October 30, 2020      90        3.4
## 6           Drive                Action  November 1, 2019     147        3.5
##           Language
## 1 English/Japanese
## 2          Spanish
## 3          Italian
## 4          English
## 5            Hindi
## 6            Hindi
# Standalone copies of the filtered table's columns used in the plots below,
# followed by a printout of the language vector.
runtime <- netflixLang[["Runtime"]]
imdb <- netflixLang[["IMDB.Score"]]
lang <- netflixLang[["Language"]]
genre <- netflixLang[["Genre"]]
lang
##   [1] "English" "Hindi"   "Hindi"   "English" "English" "Hindi"   "English"
##   [8] "English" "Hindi"   "English" "English" "English" "English" "English"
##  [15] "Hindi"   "English" "English" "English" "English" "English" "English"
##  [22] "English" "English" "English" "Hindi"   "English" "English" "English"
##  [29] "English" "English" "English" "English" "English" "English" "English"
##  [36] "English" "English" "English" "English" "English" "English" "English"
##  [43] "English" "English" "English" "English" "English" "English" "English"
##  [50] "English" "English" "English" "English" "English" "Hindi"   "English"
##  [57] "English" "English" "English" "Hindi"   "English" "Hindi"   "English"
##  [64] "English" "English" "English" "English" "English" "English" "Hindi"  
##  [71] "English" "English" "English" "English" "English" "English" "English"
##  [78] "English" "English" "English" "English" "English" "English" "English"
##  [85] "English" "English" "English" "English" "English" "English" "English"
##  [92] "English" "English" "English" "English" "Hindi"   "English" "English"
##  [99] "English" "English" "English" "English" "English" "English" "English"
## [106] "English" "English" "English" "English" "English" "English" "English"
## [113] "English" "Hindi"   "Hindi"   "English" "English" "English" "English"
## [120] "English" "English" "Hindi"   "English" "English" "English" "English"
## [127] "English" "English" "English" "English" "English" "English" "English"
## [134] "English" "English" "English" "English" "English" "English" "English"
## [141] "Hindi"   "English" "English" "English" "English" "English" "English"
## [148] "Hindi"   "English" "English" "English" "English" "English" "English"
## [155] "English" "English" "English" "English" "English" "English" "English"
## [162] "English" "English" "English" "English" "English" "English" "English"
## [169] "English" "English" "English" "English" "English" "English" "Hindi"  
## [176] "English" "English" "English" "English" "English" "English" "English"
## [183] "English" "English" "English" "English" "English" "English" "English"
## [190] "English" "English" "English" "English" "English" "English" "English"
## [197] "Hindi"   "English" "English" "English" "English" "English" "English"
## [204] "English" "English" "English" "English" "English" "English" "English"
## [211] "English" "English" "English" "English" "English" "English" "English"
## [218] "English" "English" "English" "English" "English" "English" "English"
## [225] "English" "English" "English" "English" "English" "English" "English"
## [232] "English" "Hindi"   "English" "English" "English" "English" "English"
## [239] "English" "Hindi"   "English" "English" "English" "English" "English"
## [246] "English" "English" "English" "English" "Hindi"   "English" "English"
## [253] "English" "Hindi"   "English" "English" "English" "English" "English"
## [260] "English" "English" "Hindi"   "English" "English" "English" "English"
## [267] "English" "English" "English" "English" "English" "English" "English"
## [274] "English" "English" "English" "English" "Hindi"   "English" "English"
## [281] "English" "English" "English" "Hindi"   "English" "English" "English"
## [288] "English" "English" "English" "Hindi"   "English" "English" "English"
## [295] "English" "Hindi"   "English" "English" "English" "English" "English"
## [302] "English" "English" "Hindi"   "English" "English" "English" "English"
## [309] "English" "English" "English" "English" "English" "English" "English"
## [316] "English" "English" "English" "English" "English" "English" "English"
## [323] "English" "English" "English" "English" "English" "English" "English"
## [330] "English" "English" "English" "English" "English" "English" "English"
## [337] "English" "English" "English" "English" "English" "English" "English"
## [344] "English" "English" "English" "English" "English" "English" "English"
## [351] "English" "English" "English" "English" "English" "English" "Hindi"  
## [358] "English" "English" "Hindi"   "English" "English" "English" "English"
## [365] "English" "English" "English" "English" "English" "English" "English"
## [372] "Hindi"   "English" "English" "English" "English" "English" "English"
## [379] "English" "English" "English" "English" "English" "English" "English"
## [386] "English" "English" "English" "English" "English" "English" "English"
## [393] "English" "English" "English" "English" "English" "English" "English"
## [400] "Hindi"   "English" "English" "English" "English" "English" "English"
## [407] "English" "Hindi"   "English" "English" "English" "English" "English"
## [414] "English" "English" "English" "English" "English" "English" "English"
## [421] "English" "English" "English" "English" "English" "English" "English"
## [428] "English" "English" "English" "English" "English" "English" "English"

Scatter Plot

# Scatterplot of IMDB score against runtime for the English and Hindi titles.
plot(
  x = runtime,
  y = imdb,
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)

Scatter Plot with Jitter

# Same scatterplot with default jitter on both axes to separate
# overlapping points.
plot(
  x = jitter(runtime),
  y = jitter(imdb),
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)

Scatter Plot with Jitter 3

# Same scatterplot with stronger jitter (factor = 3) on both axes.
plot(
  x = jitter(runtime, factor = 3),
  y = jitter(imdb, factor = 3),
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)

# Conditioning Plot

# Conditioning plot of runtime against IMDB score, one panel per language.
# The variables are taken directly from netflixLang's columns. The axis labels
# describe this data set: IMDB score on the x-axis, runtime on the y-axis.
# The empty `main` was dropped: coplot() has no `main` parameter, so it would
# only have been forwarded to the panel function.
coplot(
  Runtime ~ IMDB.Score | Language,
  data = netflixLang,
  cex = 0.6,
  xlab = "IMDB Score",
  ylab = "Movie Runtime"
)

# Simpson’s Paradox

The data don’t show Simpson’s Paradox, since both Hindi and English movies show a concentration of higher IMDB scores in the central area of the graph. However, English has consistently more high-scoring movies with a lower runtime.

  • You will present results of Part 3 to your neighbor(s) in class of Jan. 7 (Mon).