library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
This is an individual assignment. Submit your .Rmd and “knitted”.html files through Collab.
Upload your html file on RPubs and include the link when you submit your submission files on Collab.
Please don’t use ggplot2 for this assignment. We’ll use ggplot2 almost all the time after this assignment.
Use the occupational experience variable (“oexp”) of the income_example dataset and plot
You can either produce four separate but small plots, or you can use par(mfrow = c(2, 2)) to create a plotting region consisting of four subplots.
Briefly describe the distributions of occupational experience in words (also include your plots and the R syntax).
[“Play” with the hist() and density() functions; for instance, by choosing a different number of bins or different break points for the hist() function, or different bandwidths using the adjust argument in density(). See also the corresponding help files and the examples given there. Only include the histogram and density estimate you find most informative. Also, add useful axis-labels and a title using the following arguments inside the plotting functions: xlab, ylab, main. Use the help ?par() for the description of many more plotting parameter.]
[That’s what the plots could look like – but you have to do it with your group;-)]
# Load the income example data. No `header` argument is passed, so
# read.table() decides on its own whether the first line holds column names.
incomeTab <- read.table("income_exmpl.dat")
# Preview the first rows to confirm the columns were parsed as expected.
head(incomeTab)
## sex age edu occ oexp income
## 1 f 62 low low 35 953
## 2 m 32 high high 6 1224
## 3 m 56 med. high 36 1466
## 4 f 63 med. med. 38 1339
## 5 m 20 low low 3 1184
## 6 f 38 med. med. 12 1196
# Put the categorical variables into their natural order (low, med., high)
# and give sex readable labels, so tables and plots list the groups sensibly.
incomeTab$occ <- factor(incomeTab$occ, levels = c("low", "med.", "high"))
incomeTab$edu <- factor(incomeTab$edu, levels = c("low", "med.", "high"))
incomeTab$sex <- factor(
  incomeTab$sex,
  levels = c("m", "f"),
  labels = c("male", "female")
)
# Take the standalone copies only AFTER releveling. Copies made before it keep
# the raw values, and anything built from them (e.g. a grouped boxplot) orders
# its groups alphabetically instead of by the levels set above.
occ <- incomeTab$occ
sex <- incomeTab$sex
#Histogram Plots This is a histogram with bins of 5 years of experience. I believe this distribution demonstrates that from year 5 to year 35 there is little decrease in the number of people at each level of experience. This stagnation suggests that most people work for about 35 years and that a significant portion start to retire in the following years. Conversely, in the first 5 years there is an influx of people.
# Keep occupational experience as its own vector; the plots and summaries
# further down refer to it by this name.
oexp <- incomeTab[["oexp"]]
# Frequency histogram using hist()'s default break points.
hist(
  oexp,
  xlab = "Occ. Experience",
  main = "Histogram of Occ. Experience"
)
A distribution with density does show the same data as above, but density is a better metric than frequency in this situation.
# Same histogram drawn on the density scale instead of raw counts.
# The argument is spelled out as `probability = TRUE`: `prob` only works
# through partial argument matching, and `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` cannot.
hist(
  oexp,
  probability = TRUE,
  main = "Histogram of Occ. Experience Density",
  xlab = "Occ. Experience"
)
#Kernel Density Estimate This density estimate outlines the curve of occupational experience, which can show the life cycle of employees’ careers. It also shows how evenly experience is spread across the workforce.
# Kernel density estimate of occupational experience, computed with
# density()'s default settings and drawn with an explicit title and x label.
with(
  incomeTab,
  plot(
    density(oexp),
    xlab = "Occ. Experience",
    main = "Density Estimate Distribution of Occ. Experience"
  )
)
#Box plot This box plot shows that the middle 50 percent of the work force has between 8 and 30 years of experience, while the remaining half is split evenly between those with fewer than 8 years and those with more than 30 years.
# Minimum, quartiles and maximum of occupational experience.
# `probs` is written in full instead of relying on partial matching of `prob`.
quantile(oexp, probs = c(0, 0.25, 0.5, 0.75, 1))
## 0% 25% 50% 75% 100%
## 0 8 19 30 48
# Cut points at every 20% of the distribution (quintiles).
# `probs` is written in full instead of relying on partial matching of `prob`.
quantile(oexp, probs = seq(0, 1, by = 0.2))
## 0% 20% 40% 60% 80% 100%
## 0 6 15 23 32 48
# Horizontal box plot of occupational experience.
# `TRUE` replaces `T`, which is an ordinary variable that can be reassigned.
boxplot(
  oexp,
  horizontal = TRUE,
  main = "Boxplot of Occ. Experience",
  xlab = "Occ. Experience (in years)"
)
#A set of box plots The set of box plots shows that male occupational experience doesn’t change with age, while the female plots show a large dip for the female med. group. In addition, the female low and high groups both have more occupational experience than the corresponding male groups. The low female med. value could represent women staying at home to take care of their children.
# Occupational experience by occupational status within each sex.
# `data = incomeTab` makes the formula use the releveled factors stored in the
# data frame (occ: low, med., high; sex: male, female). Without it, `occ` and
# `sex` resolve to standalone copies taken before releveling, whose groups sort
# alphabetically (high, low, med. / f, m) and therefore do not line up with
# the hand-written `names` or the blue/red colouring below.
boxplot(
  oexp ~ occ + sex,
  data = incomeTab,
  col = rep(c("blue", "red"), each = 3),  # first three boxes male, last three female
  names = c(
    "male\nlow", "male\nmed", "male\nhigh",
    "female\nlow", "female\nmed", "female\nhigh"
  ),
  xlab = "",
  ylab = "Occ. Experience",
  horizontal = FALSE,
  main = "Occ. Experience by Sex and Occ. Status"
)
jitter() or alpha() for avoiding overlying points.
library(haven)
# Read the SPSS data file with haven.
scs <- haven::read_sav("SCS_QE.sav")
# Print the data to show its size, column names and first rows.
print(scs)
## # A tibble: 210 × 31
## vocabpre mathpre numbmath likemath likelit preflit pextra pagree pconsc pemot
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 7 2 6 7 2 16 39 35 29
## 2 26 3 2 2 10 3 22 41 35 29
## 3 17 5 1 3 8 3 31 39 39 29
## 4 23 4 2 8 10 2 22 46 34 33
## 5 23 5 2 2 7 3 29 48 48 40
## 6 28 7 2 2 9 3 28 43 34 36
## 7 12 5 4 6 6 1 31 32 36 33
## 8 25 10 3 4 3 2 23 36 32 15
## 9 17 9 1 7 2 1 20 50 40 30
## 10 23 8 2 4 7 3 34 42 33 31
## # … with 200 more rows, and 21 more variables: pintell <dbl>, mars <dbl>,
## # beck <dbl>, rq <dbl+lbl>, vm <dbl+lbl>, cauc <dbl+lbl>, afram <dbl+lbl>,
## # other <dbl>, age <dbl>, male <dbl+lbl>, married <dbl+lbl>, parents <dbl>,
## # momdegr <dbl>, daddegr <dbl>, credit <dbl>, majormi <dbl+lbl>,
## # actcomp <dbl>, hsgpaar <dbl>, collgpaa <dbl>, vocaball <dbl>, mathall <dbl>
# Pull the two variables used in the scatter plots out of the data set.
mathpre <- scs[["mathpre"]]
mars <- scs[["mars"]]
Scatter Plot
# Plain scatter plot of math pretest against math anxiety, drawn with small
# filled points.
plot(
  x = mathpre,
  y = mars,
  pch = 16,
  cex = 0.4,
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale"
)
Scatter Plot using jitter
# Same scatter plot with jitter() applied to both variables, so that points
# sharing identical values no longer sit exactly on top of each other.
# The title previously read "Fitter"; it now names the jitter() technique.
plot(
  jitter(mathpre),
  jitter(mars),
  main = "Scatter Plot using Jitter",
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale",
  cex = 0.4,
  pch = 16
)
Scatter Plot with a jitter factor of 3
# Scatter plot with a larger jitter (factor = 3) on both axes.
# x and y are named explicitly so the y values are not tucked in behind the
# title argument.
plot(
  x = jitter(mathpre, factor = 3),
  y = jitter(mars, factor = 3),
  pch = 16,
  cex = 0.4,
  xlab = "Math Pretest",
  ylab = "Mathematics Anxiety Rating Scale",
  main = "Scatter Plot with a Jitter Factor of 3"
)
| male” in your first argument to create a conditioning plot.
# Conditioning plot of math pretest against math anxiety, one panel per sex.
# `male` is read from the SPSS file as a labelled numeric column (<dbl+lbl>).
# coplot() slices a numeric conditioning variable into overlapping intervals,
# so it is converted to a factor first (on a copy, leaving `scs` untouched)
# to get exactly one panel per group.
scs_by_sex <- scs
scs_by_sex$male <- haven::as_factor(scs_by_sex$male)
# NOTE(review): `main` is not a named coplot() argument and is forwarded to
# the panel function through `...` -- confirm the title is actually drawn.
coplot(
  mathpre ~ mars | male,
  data = scs_by_sex,
  cex = 0.6,
  xlab = "Mathematics Anxiety Rating Scale",
  ylab = "Math Pretest",
  main = "Math Pretest by Mathematics Anxiety Rating Scale separated by Gender(M/F)"
)
I don’t believe that Simpson’s Paradox is present, because the scatter plots for the male and female data don’t show any difference in trend. As the Math Pretest score increases, the Mathematics Anxiety Rating Scale score decreases. This trend is the same for males and females, except that the female data are more spread out between the two extremes.
Use a dataset that is available in data repositories (e.g., kaggle)
Briefly describe the dataset you’re using (e.g., means to access data, context, sample, variables, etc…)
Re-do Part 2, i.e.,
jitter() or alpha() for avoiding overlying points.| C” in your first argument to create a conditioning plot.
# Load the Netflix Originals data set.
netflix <- read.csv("NetflixOriginals.csv")
# Keep only titles whose Language is exactly "English" or "Hindi"; combined
# entries such as "English/Japanese" do not match and are dropped.
target <- c("English", "Hindi")
# filter() is namespaced because dplyr's version masks stats::filter().
netflixLang <- dplyr::filter(netflix, Language %in% target)
# Preview the first rows of the unfiltered data.
head(netflix)
## Title Genre Premiere Runtime IMDB.Score
## 1 Enter the Anime Documentary August 5, 2019 58 2.5
## 2 Dark Forces Thriller August 21, 2020 81 2.6
## 3 The App Science fiction/Drama December 26, 2019 79 2.6
## 4 The Open House Horror thriller January 19, 2018 94 3.2
## 5 Kaali Khuhi Mystery October 30, 2020 90 3.4
## 6 Drive Action November 1, 2019 147 3.5
## Language
## 1 English/Japanese
## 2 Spanish
## 3 Italian
## 4 English
## 5 Hindi
## 6 Hindi
# Pull the columns of the filtered data out into standalone vectors.
runtime <- netflixLang[["Runtime"]]
imdb <- netflixLang[["IMDB.Score"]]
lang <- netflixLang[["Language"]]
genre <- netflixLang[["Genre"]]
# Print the language of every remaining title to check the filter.
print(lang)
## [1] "English" "Hindi" "Hindi" "English" "English" "Hindi" "English"
## [8] "English" "Hindi" "English" "English" "English" "English" "English"
## [15] "Hindi" "English" "English" "English" "English" "English" "English"
## [22] "English" "English" "English" "Hindi" "English" "English" "English"
## [29] "English" "English" "English" "English" "English" "English" "English"
## [36] "English" "English" "English" "English" "English" "English" "English"
## [43] "English" "English" "English" "English" "English" "English" "English"
## [50] "English" "English" "English" "English" "English" "Hindi" "English"
## [57] "English" "English" "English" "Hindi" "English" "Hindi" "English"
## [64] "English" "English" "English" "English" "English" "English" "Hindi"
## [71] "English" "English" "English" "English" "English" "English" "English"
## [78] "English" "English" "English" "English" "English" "English" "English"
## [85] "English" "English" "English" "English" "English" "English" "English"
## [92] "English" "English" "English" "English" "Hindi" "English" "English"
## [99] "English" "English" "English" "English" "English" "English" "English"
## [106] "English" "English" "English" "English" "English" "English" "English"
## [113] "English" "Hindi" "Hindi" "English" "English" "English" "English"
## [120] "English" "English" "Hindi" "English" "English" "English" "English"
## [127] "English" "English" "English" "English" "English" "English" "English"
## [134] "English" "English" "English" "English" "English" "English" "English"
## [141] "Hindi" "English" "English" "English" "English" "English" "English"
## [148] "Hindi" "English" "English" "English" "English" "English" "English"
## [155] "English" "English" "English" "English" "English" "English" "English"
## [162] "English" "English" "English" "English" "English" "English" "English"
## [169] "English" "English" "English" "English" "English" "English" "Hindi"
## [176] "English" "English" "English" "English" "English" "English" "English"
## [183] "English" "English" "English" "English" "English" "English" "English"
## [190] "English" "English" "English" "English" "English" "English" "English"
## [197] "Hindi" "English" "English" "English" "English" "English" "English"
## [204] "English" "English" "English" "English" "English" "English" "English"
## [211] "English" "English" "English" "English" "English" "English" "English"
## [218] "English" "English" "English" "English" "English" "English" "English"
## [225] "English" "English" "English" "English" "English" "English" "English"
## [232] "English" "Hindi" "English" "English" "English" "English" "English"
## [239] "English" "Hindi" "English" "English" "English" "English" "English"
## [246] "English" "English" "English" "English" "Hindi" "English" "English"
## [253] "English" "Hindi" "English" "English" "English" "English" "English"
## [260] "English" "English" "Hindi" "English" "English" "English" "English"
## [267] "English" "English" "English" "English" "English" "English" "English"
## [274] "English" "English" "English" "English" "Hindi" "English" "English"
## [281] "English" "English" "English" "Hindi" "English" "English" "English"
## [288] "English" "English" "English" "Hindi" "English" "English" "English"
## [295] "English" "Hindi" "English" "English" "English" "English" "English"
## [302] "English" "English" "Hindi" "English" "English" "English" "English"
## [309] "English" "English" "English" "English" "English" "English" "English"
## [316] "English" "English" "English" "English" "English" "English" "English"
## [323] "English" "English" "English" "English" "English" "English" "English"
## [330] "English" "English" "English" "English" "English" "English" "English"
## [337] "English" "English" "English" "English" "English" "English" "English"
## [344] "English" "English" "English" "English" "English" "English" "English"
## [351] "English" "English" "English" "English" "English" "English" "Hindi"
## [358] "English" "English" "Hindi" "English" "English" "English" "English"
## [365] "English" "English" "English" "English" "English" "English" "English"
## [372] "Hindi" "English" "English" "English" "English" "English" "English"
## [379] "English" "English" "English" "English" "English" "English" "English"
## [386] "English" "English" "English" "English" "English" "English" "English"
## [393] "English" "English" "English" "English" "English" "English" "English"
## [400] "Hindi" "English" "English" "English" "English" "English" "English"
## [407] "English" "Hindi" "English" "English" "English" "English" "English"
## [414] "English" "English" "English" "English" "English" "English" "English"
## [421] "English" "English" "English" "English" "English" "English" "English"
## [428] "English" "English" "English" "English" "English" "English" "English"
Scatter Plot
# Plain scatter plot of runtime against IMDB score, drawn with small filled
# points.
plot(
  x = runtime,
  y = imdb,
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)
Scatter Plot with Jitter
# Same scatter plot with jitter() applied to both variables to separate
# points that share identical values.
plot(
  x = jitter(runtime),
  y = jitter(imdb),
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)
Scatter Plot with Jitter 3
# Scatter plot with a larger jitter (factor = 3) on both axes.
plot(
  x = jitter(runtime, factor = 3),
  y = jitter(imdb, factor = 3),
  pch = 16,
  cex = 0.4,
  xlab = "Movie Runtime",
  ylab = "IMDB",
  main = "Scatter Plot of Netflix Originals Runtime by IMDB Score"
)
#Conditioning Plot
# Conditioning plot of runtime (y) against IMDB score (x), one panel per
# language. The axis labels previously read "Mathematics Anxiety Rating Scale"
# and "Math Pretest", left over from the earlier plot; they now describe the
# variables actually drawn.
# `runtime`, `imdb` and `lang` are not column names of `netflixLang`, so the
# formula picks up the standalone vectors of those names instead.
coplot(
  runtime ~ imdb | lang,
  data = netflixLang,
  cex = 0.6,
  xlab = "IMDB",
  ylab = "Movie Runtime",
  main = ""
)
#Simpson’s Paradox The data don’t show Simpson’s Paradox, since both Hindi and English movies show a concentration of higher IMDB scores in the central area of the graph. However, English movies consistently include more high-scoring movies with a lower runtime.