#Code here
((2026-2013)/(2026-1994))*100[1] 40.625
Tools For Data Science
In this project you will be working with R base and Tidyverse.
⚠️ Students enrolled in CAP4755 should only solve: 1, 2, 3, 4, 6, 7, 8, and 9.
#Code here
((2026-2013)/(2026-1994))*100[1] 40.625
#Code here
# generate 100 random numbers
mdata=rnorm(100, mean = 0, sd = 1)
# find the sqrt
sqrt_data = sqrt(abs(mdata))
# find the mean
mean(sqrt_data)[1] 0.8849802
for-loop which runs through the whole vector. Multiply the elements which are smaller than 20 or larger than 80 by 10 and the other elements by 0.1.#Code here
x = c(1:100)
y = numeric()
for (n in 1:100) {
if(x[n]<20 || x[n]>80) {
y[n] = x[n]*10
} else {
y[n] = x[n]*0.1
}
}
y [1] 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 100.0
[11] 110.0 120.0 130.0 140.0 150.0 160.0 170.0 180.0 190.0 2.0
[21] 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0
[31] 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0
[41] 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
[51] 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6.0
[61] 6.1 6.2 6.3 6.4 6.5 6.6 6.7 6.8 6.9 7.0
[71] 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9 8.0
[81] 810.0 820.0 830.0 840.0 850.0 860.0 870.0 880.0 890.0 900.0
[91] 910.0 920.0 930.0 940.0 950.0 960.0 970.0 980.0 990.0 1000.0
function to combine questions 2 and 3, so that you can feed it an integer n you like (as argument). The function 1) generates n random number, 2) multiplies the elements which are smaller than 20 or larger than 80 by 10 and the other elements by 0.1, then 3) returns the mean of the square root of the vector.#Code here
#create function generate 'n' random numbers
func = function(n){
mdata=rnorm(n, mean = 0, sd = 1)
#multiplies x<20 or x>80 by 10 others by 0.1
x = c(1:n)
y = numeric(n)
for (i in 1:n) {
if(x[i]<20 || x[i]>80) {
y[i] = x[i]*10
} else {
y[i] = x[i]*0.1
}
}
#return mean and the sqrt of a vector
sqrt_data = sqrt(abs(mdata))
result = mean(sqrt_data)
return(result)
}We will look into the distribution of baby names. Use Tidyverse to answer the questions. This data set has over 2 millions rows from 1880 to 2022 It was provided by the Social Security Administration. It has the following variables:
Read the data in R and called it bbnames:
# load packages
library(data.table)
library(tidyverse)
# Load the data - take a minute to load :)
bbnames = fread("https://pages.uwf.edu/acohen/teaching/datasets/babynames.csv", drop = "V1")
bbnames name sex counts year
<char> <char> <int> <int>
1: Mary F 7065 1880
2: Anna F 2604 1880
3: Emma F 2003 1880
4: Elizabeth F 1939 1880
5: Minnie F 1746 1880
---
2085154: Zuberi M 5 2022
2085155: Zydn M 5 2022
2085156: Zylon M 5 2022
2085157: Zymeer M 5 2022
2085158: Zymeire M 5 2022
# Code here
print(bbnames) name sex counts year
<char> <char> <int> <int>
1: Mary F 7065 1880
2: Anna F 2604 1880
3: Emma F 2003 1880
4: Elizabeth F 1939 1880
5: Minnie F 1746 1880
---
2085154: Zuberi M 5 2022
2085155: Zydn M 5 2022
2085156: Zylon M 5 2022
2085157: Zymeer M 5 2022
2085158: Zymeire M 5 2022
group_by and summarise). Then, find which year had the highest number of babies:#Code here
#Code here
sameyr = bbnames%>% group_by(year)%>%summarize(n())%>%ungroup()
print(sameyr)# A tibble: 143 × 2
year `n()`
<int> <int>
1 1880 2000
2 1881 1934
3 1882 2127
4 1883 2084
5 1884 2297
6 1885 2294
7 1886 2392
8 1887 2373
9 1888 2651
10 1889 2590
# ℹ 133 more rows
#Code here
male_pop = bbnames %>% filter(sex=="M") %>% group_by(name) %>% top_n(1,wt=n)
female_pop = bbnames %>% filter(sex=="F") %>% group_by(name) %>% top_n(1,wt=n)
print(male_pop)# A tibble: 857,363 × 4
# Groups: name [43,655]
name sex counts year
<chr> <chr> <int> <int>
1 John M 9655 1880
2 William M 9532 1880
3 James M 5927 1880
4 Charles M 5348 1880
5 George M 5126 1880
6 Frank M 3242 1880
7 Joseph M 2632 1880
8 Thomas M 2534 1880
9 Henry M 2444 1880
10 Robert M 2415 1880
# ℹ 857,353 more rows
print(female_pop)# A tibble: 1,227,795 × 4
# Groups: name [70,227]
name sex counts year
<chr> <chr> <int> <int>
1 Mary F 7065 1880
2 Anna F 2604 1880
3 Emma F 2003 1880
4 Elizabeth F 1939 1880
5 Minnie F 1746 1880
6 Margaret F 1578 1880
7 Ida F 1472 1880
8 Alice F 1414 1880
9 Bertha F 1320 1880
10 Sarah F 1288 1880
# ℹ 1,227,785 more rows
age and filter by age - Pick a threshold that would keep only people who may still alive (you may use the age expectancy):#Code here
library(tidyverse)
new_df <-bbnames %>%
mutate(age = sample(1:100, n(), replace = TRUE))
filtered_df <- new_df %>%
filter(age <= 80)
print(filtered_df) name sex counts year age
<char> <char> <int> <int> <int>
1: Mary F 7065 1880 13
2: Anna F 2604 1880 61
3: Emma F 2003 1880 60
4: Elizabeth F 1939 1880 61
5: Minnie F 1746 1880 4
---
1668422: Zoumana M 5 2022 2
1668423: Zydn M 5 2022 41
1668424: Zylon M 5 2022 60
1668425: Zymeer M 5 2022 39
1668426: Zymeire M 5 2022 79
year and y-axis is counts. Use geom_bar(), geom_line(), and facet_wrap() to separate females and males (use scale="free" to free the scales) .#Code here
# Load the tidyverse package
library(tidyverse)
# Create a sample data frame
df <- bbnames
# Choose a name to filter by
chosen_name <- "John"
# Filter the data for the chosen name
filtered_df <- df %>%
filter(name == chosen_name)
# Plot the distribution using ggplot2
ggplot(filtered_df, aes(x = year, y = counts, group = sex, color = sex)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.7) +
geom_line(size = 1) +
facet_wrap(~ sex, scales = "free") +
labs(title = paste("Distribution of", chosen_name, "over the Years"),
x = "Year",
y = "Counts") +
theme_minimal()year and y-axis is counts. Use geom_bar(), geom_line(), and facet_wrap() to separate females and males (use scale="free" to free the scales) .#Code here
# Load the tidyverse package
library(tidyverse)
# Create a sample data frame
df <- bbnames
# Choose a name to filter by
chosen_name <- "Peter"
# Filter the data for the chosen name
filtered_df <- df %>%
filter(name == chosen_name)
# Plot the distribution using ggplot2
ggplot(filtered_df, aes(x = year, y = counts, group = sex, color = sex)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.7) +
geom_line(size = 1) +
facet_wrap(~ sex, scales = "free") +
labs(title = paste("Distribution of", chosen_name, "over the Years"),
x = "Year",
y = "Counts") +
theme_minimal()Data was obtained from the Federation Aviation Administration (FAA) in June 2023 on pilot certification. The data has over 450000 pilots records and contained the following:
Read the data in R and called it pilots:
# Code here
install.packages("pilots", repos = "https://cloud.r-project.org")
# load packages
library(data.table)
library(tidyverse)
# Load the data
pilots = fread("https://pages.uwf.edu/acohen/teaching/datasets/pilotscertification.csv")
pilots ID STATE MedClass MedExpMonth MedExpYear CertLevel
<char> <char> <int> <int> <int> <char>
1: A0000014 FL 3 10 2023 Airline
2: A0000030 GA 3 8 2019 Private
3: A0000087 NH NA NA NA Airline
4: A0000113 CA 1 11 2023 Airline
5: A0000221 AZ 1 8 2023 Airline
---
450693: C1819748 FL NA NA NA Student
450694: C1819777 IN NA NA NA Student
450695: C1819942 FL NA NA NA Student
450696: C1820011 OH 3 5 2025 Student
450697: C1820025 GA NA NA NA Student
#Code here
# Code here
pilots %>% group_by(CertLevel,MedExpMonth,MedExpYear) %>% summarise(n()) %>% filter(MedExpMonth==6 , MedExpYear==2024)# A tibble: 6 × 4
# Groups: CertLevel, MedExpMonth [6]
CertLevel MedExpMonth MedExpYear `n()`
<chr> <int> <int> <int>
1 Airline 6 2024 139
2 Commercial 6 2024 402
3 Private 6 2024 2227
4 Recreational 6 2024 2
5 Sport 6 2024 2
6 Student 6 2024 262