Midterm Booster

#0. [1pts] Rename this file to include your first and last name and run the following block of code to:
#reset the workspace, load the libraries, load the data, and define a function to calculate the mode.

# Clear the workspace, as the assignment instructions above require.
rm(list = ls())

# Install each required package only when it is not already available, so that
# re-running the script does not download and reinstall it every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

invisible(library(dplyr))

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

#load data
# Read the midterm dataset from the course URL, keeping text columns as
# character vectors. FALSE is spelled out because the shorthand F is an ordinary
# variable that can be reassigned.
data <- read.csv(
  "http://tinyurl.com/dida325midtermdata",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8-BOM"
)
# Drop every row that contains at least one missing value.
data <- na.omit(data)

# Custom mode finder (the j_ prefix is for Jacopo, to avoid clashing with
# existing names).
# x: a vector whose values can be read back as numbers.
# Returns the most frequent value of x as a numeric.
j_mode <- function(x) {
  frequencies <- table(x)
  # Negating the counts makes an ascending sort put the largest count first.
  most_frequent_first <- sort(-frequencies)
  top_label <- names(most_frequent_first)[1]
  as.numeric(top_label)
}

#1. [5pts][code] Filter out lines containing the value "total" in the job_type column and overwrite the "data" dataframe with the new reduced dataset.

# Keep only the rows whose job_type does not contain "total" (matched without
# regard to case), overwrite `data` with the result, and show its structure.
data <- filter(data, !grepl("total", job_type, ignore.case = TRUE))
str(data)

## 'data.frame':    55 obs. of  8 variables:
##  $ job_type       : chr  "professional" "computer_all" "computer" "computer" ...
##  $ description    : chr  "Professional and related occupations" "Computer and mathematical occupations" "Computer systems analysts" "Information security analysts" ...
##  $ year           : int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ All            : num  36502 5603 594 137 417 ...
##  $ Women          : num  57 25.2 35.6 11.4 21.1 19.4 25.1 27.8 44.8 25.9 ...
##  $ Black          : num  10.5 9.1 9.7 11.9 6.3 6.2 12 3.7 5.9 13 ...
##  $ Asian          : num  10.1 23 18.7 6.9 28.3 34.1 29.6 16.2 9.9 10.7 ...
##  $ Hispanic.Latino: num  10.1 8.4 8.1 15.8 6.6 5.9 9.2 5.9 15.8 11.6 ...
##  - attr(*, "na.action")= 'omit' Named int [1:11] 4 17 18 25 36 37 40 53 54 56 ...
##   ..- attr(*, "names")= chr [1:11] "4" "17" "18" "25" ...

# Mean, median and mode of a `salary` column, reported only when that column
# exists. The str() output above lists eight columns and none is named `salary`,
# so data$salary is NULL there: mean() warns and returns NA, and the median and
# mode print as empty. Guarding the block avoids reporting those values.
# j_mode is already defined near the top of the file, so it is not redefined.
mean_salary <- NA_real_
median_salary <- NA_real_
if ("salary" %in% names(data)) {
  mean_salary <- mean(data[["salary"]])
  median_salary <- median(data[["salary"]])
  cat("Mean Salary:", mean_salary, "\n")
  cat("Median Salary:", median_salary, "\n")
  cat("Mode Salary:", j_mode(data[["salary"]]), "\n")
} else {
  message("Column 'salary' not found in data; skipping the salary summary.")
}

#2.[10pts][code] Create a summary table following these steps:
#-p1. create a dataframe named mt_summary with 4 columns called mean,median,mode,range each having a single value of zero
#-p2. using the mean function store the mean of the data$Asian series in mt_summary$mean
#-p3. using the median function store the median of the data$Asian series in mt_summary$median
#-p4. using the j_mode function store the mode of the data$Asian series in mt_summary$mode
#-p5. using min and max calculate the range of the data$Asian series and store it in mt_summary$range
#-p6. print the table we just created


# p1: one-row summary table with every statistic initialised to zero
mt_summary <- data.frame(mean = 0, median = 0, mode = 0, range = 0)

# p2-p5: overwrite each placeholder with the statistic of the Asian series
mt_summary[["mean"]] <- mean(data[["Asian"]])
mt_summary[["median"]] <- median(data[["Asian"]])
mt_summary[["mode"]] <- j_mode(data[["Asian"]])
mt_summary[["range"]] <- max(data[["Asian"]]) - min(data[["Asian"]])

# p6: show the finished table
print(mt_summary)

##       mean median mode range
## 1 15.36727     12  9.6  30.7

#3.[10pts][code] Distribution plots
#-p1.using ggplot geom_histogram print the histogram plot of the data series data$Asian
#-p2.using ggplot geom_density print the distribution plot of the data series data$Asian
# Histogram plot using ggplot geom_histogram with binwidth 2.5.
# The Asian series has a range of about 30.7 (see mt_summary), so the previous
# binwidth of 400 placed every observation in a single bar.

histogram_plot <- ggplot(data, aes(x = Asian)) +
  geom_histogram(binwidth = 2.5, fill = "skyblue", color = "black", alpha = 0.75) +
  labs(title = "Histogram of Asian Series", x = "Asian", y = "Frequency") +
  theme_minimal()

print(histogram_plot)

# Density plot of the same series using ggplot geom_density.
density_plot <- ggplot(data, aes(x = Asian)) +
  geom_density(fill = "skyblue", color = "black", alpha = 0.75) +
  labs(title = "Density Distribution of Asian Series", x = "Asian", y = "Density") +
  theme_minimal()

print(density_plot)

# For question 3, I created two visualizations of the Asian series data from the data dataframe. The first visualization is a histogram, representing the frequency distribution of the data with bins of width 2.5, filled with a sky blue color and drawn with an alpha of 0.75. The second visualization is a density plot, displaying the estimated probability density of the Asian data, similarly filled with sky blue color and drawn with an alpha of 0.75. Both plots are given appropriate titles and axis labels for clarity and are printed for visualization.

#4.[10pts][write plain] What kind of distribution is this?
#-p1. unimodal, multimodal, uniform?
#-p2. normal (no skew), positively skewed, negatively skewed

# The histograms and density plot indicate a positively skewed unimodal distribution. This is because the distribution has a single peak and is skewed towards higher values, with a tail extending towards the right side of the distribution.

#5.[12pts][code]Calculate the quartiles
#-p1. sort data$Asian and save sorted array in a new variable called asian_s
#-p2. divide the array in two sub arrays called asian_fh and asian_sh containing the first half of the sorted array and the second half of the sorted array
#-p3. find the median of the fh interval and the median of the second interval store them as q1 and q3
#-p4. calculate the IQR and save it as a variable name iqr
#-p5. using mutate add them to the mt_summary table using the names Q1,Q3,IQR
#-p6 print the updated mt_summary table

#p1
# Sort the Asian series in ascending order.
asian_s <- sort(data$Asian)

#p2
# Split the sorted values into a lower and an upper half of equal size. The
# half-length uses integer division, so with an odd number of values the middle
# value (the median) belongs to neither half.
# The earlier `1:length(asian_s)/2` parsed as `(1:length(asian_s)) / 2`, which
# produced fractional, duplicated indices, and the second half started at a
# fractional index, shifting it by one element.
half_n <- length(asian_s) %/% 2
asian_fh <- head(asian_s, half_n)
asian_sh <- tail(asian_s, half_n)

#p3
q1 <- median(asian_fh)
q3 <- median(asian_sh)

#p4
# Derive the IQR from the two quartiles above so the three columns agree with
# each other; IQR() returned 11.25 while the recorded Q3 - Q1 was 10.5.
iqr <- q3 - q1

#p5
mt_summary <- mt_summary %>%
  mutate(Q1 = q1, Q3 = q3, IQR = iqr)

#p6
print(mt_summary)

# NOTE: the output below was recorded with the earlier index arithmetic and
# IQR() call; re-run the script to refresh the Q3 and IQR values.
##       mean median mode range  Q1   Q3   IQR
## 1 15.36727     12  9.6  30.7 9.4 19.9 11.25

#6.[5pts][code] print a geom_boxplot of the data series data$Asian (on the x axis) by job description ( on the y axis)

# Boxplot of the Asian series (x axis) with one box per job description
# (y axis). The question and the axis label both refer to the job description,
# so the `description` column is mapped to y instead of `job_type`.
boxplot <- ggplot(data, aes(x = Asian, y = description)) +
  geom_boxplot() +
  labs(title = "Boxplot of Asian Data by Job Description",
       x = "Asian", y = "Job Description") +
  theme_minimal()

print(boxplot)

#7 [12pts][write plain] Looking at the results of questions 3-4-5-6 what can you tell me about the distribution of Asians in the workforce?
#-p1 What are the top 3 job descriptions with the most Asians? (you can count the outliers and i don't care about what year, just in absolute.make sure you remember how the box plot works, feel free to write some extra code to check and make sure you get it right, or just look at the data!)
  

# P1:
# Count the rows for each job_type and keep the three largest counts.
# slice_max() names the ranking column explicitly, where top_n(3) picked the
# last column implicitly and printed a "Selecting by count" message.
# NOTE(review): this ranks job types by how many rows they have in `data`, not
# by the Asian column; confirm this is the intended measure of "most Asians".
top_jobs <- data %>%
  group_by(job_type) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice_max(count, n = 3)

## Selecting by count

# The top 3 job descriptions with the most Asians are computer, computer_all, and professional.

# P2: Range of Numbers Representing Where Most Observations Lie in the Distribution
# The range where most observations lie is typically within Q1 - 1.5*IQR to Q3 + 1.5*IQR
# The range of numbers that represents where most of the observations lie in the distribution is from -5 to 35.

# Fences at 1.5 * IQR below Q1 and above Q3, each rounded to the nearest
# multiple of 5.
lower_range <- mt_summary[["Q1"]] - 1.5 * mt_summary[["IQR"]]
lower_range <- 5 * round(lower_range / 5)

upper_range <- mt_summary[["Q3"]] + 1.5 * mt_summary[["IQR"]]
upper_range <- 5 * round(upper_range / 5)

# Report the three most frequent job types.
cat("Top 3 Job Types with the Most Asians:\n")

## Top 3 Job Types with the Most Asians:

print(top_jobs)

## # A tibble: 3 × 2
##   job_type     count
##   <chr>        <int>
## 1 computer        47
## 2 computer_all     4
## 3 professional     4

# Report the rounded fences.
cat("\nRange of Numbers Representing Where Most Observations Lie in the Distribution:\n")

## 
## Range of Numbers Representing Where Most Observations Lie in the Distribution:

cat("Lower Range:", lower_range, "\n")

## Lower Range: -5

cat("Upper Range:", upper_range, "\n")

## Upper Range: 35

## Upper Range: 35

#-p3 find and paste a link to an example of a unimodal distribution
#https://www.statisticshowto.com/probability-and-statistics/descriptive-statistics/#below

#8 [10pts][code] using geom_point and geom_smooth generate a plot of the variables Asian and year  that contains the scatter plot and the regression line,
#you should know which goes on the x and which goes on the y axis
#you will need to specify the method="lm" in the geom smooth

  library(ggplot2)

# Scatter plot of Asian (y axis) against year (x axis) with a fitted
# linear-model line drawn on top.
plot <- ggplot(data = data, mapping = aes(x = year, y = Asian))

# Layers: the raw points first, then the method = "lm" regression line.
plot <- plot +
  geom_point() +
  geom_smooth(method = "lm")

# Title, axis labels and theme.
plot <- plot +
  labs(
    title = "Scatter Plot and Regression Line of Asian and Year",
    x = "Year",
    y = "Asian"
  ) +
  theme_minimal()

print(plot)

## `geom_smooth()` using formula = 'y ~ x'

Midterm Booster

2024-03-28