The data set for this section included population estimates
based on census data as well as immigration and deaths for the United
States. The data set contained population estimates from April to
September of 1990 for each sex, race, and age.
“Monthly Postcensal Resident Population, by single year of age, sex,
race, and Hispanic origin” https://www.census.gov/data/datasets/time-series/demo/popest/1990s-national.html
The data set was a text file with each character in the file contained
information on the population. The position of each character defined
what the data represented. Each line had 222 characters. The positions
of each character of each line is listed below.
1-2 Character Series 3-4 Numeric Month 5-8 Numeric Year 9-11 Numeric Age (years) 12 (blank) (blank) 13-22 Numeric Total population 23-32 Numeric Total male population 33-42 Numeric Total female population 43-52 Numeric White male population 53-62 Numeric White female population 63-72 Numeric Black male population 73-82 Numeric Black female population 83-92 Numeric American Indian, Eskimo, and Aleut male population 93-102 Numeric American Indian, Eskimo, and Aleut female population 103-112 Numeric Asian and Pacific Islander male population 113-122 Numeric Asian and Pacific Islander female population 123-132 Numeric Hispanic male population 133-142 Numeric Hispanic female population 143-152 Numeric White, non-Hispanic male population 153-162 Numeric White, non-Hispanic female population 163-172 Numeric Black, non-Hispanic male population 173-182 Numeric Black, non-Hispanic female population 183-192 Numeric American Indian, Eskimo, and Aleut, non-Hispanic male population 193-202 Numeric American Indian, Eskimo, and Aleut, non-Hispanic female population 203-212 Numeric Asian and Pacific Islander, non- Hispanic male population 213-222 Numeric Asian and Pacific Islander, non- Hispanic female population
First the data was read into a data frame. The data was made more tidy by making each observation include sex and race. some basic analysis was included.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
Relevant libraries were loaded.
aprsep1990df <- read.table(
"https://www2.census.gov/programs-surveys/popest/datasets/1990-2000/national/asrh/e9090rmp.txt",
sep="\t", header=FALSE)
charseries <- data.frame(substr(aprsep1990df[,1], 1,2))
month <- data.frame(substr(aprsep1990df[,1], 3,4))
year <- data.frame(substr(aprsep1990df[,1], 5,8))
age <- data.frame(substr(aprsep1990df[,1], 9,11))
totpop <- data.frame(substr(aprsep1990df[,1], 13,22))
totmalepop <- data.frame(substr(aprsep1990df[,1], 23,32))
totfemalepop <- data.frame(substr(aprsep1990df[,1], 33,42))
whtmalepop <- data.frame(substr(aprsep1990df[,1], 43,52))
whtfemalepop <- data.frame(substr(aprsep1990df[,1], 53,62))
blkmalepop <- data.frame(substr(aprsep1990df[,1], 63,72))
blkfemalepop <- data.frame(substr(aprsep1990df[,1], 73,82))
nativemalepop <- data.frame(substr(aprsep1990df[,1], 83,92))
nativefemalepop <- data.frame(substr(aprsep1990df[,1], 93,102))
asianmalepop <- data.frame(substr(aprsep1990df[,1], 103,112))
asianfemalepop <- data.frame(substr(aprsep1990df[,1], 113,122))
hismalepop <- data.frame(substr(aprsep1990df[,1], 123,132))
hisfemalepop <- data.frame(substr(aprsep1990df[,1], 133,142))
whtnonhismalepop <- data.frame(substr(aprsep1990df[,1], 143,152))
whtnonhisfemalepop <- data.frame(substr(aprsep1990df[,1], 153,162))
blknonhismalepop <- data.frame(substr(aprsep1990df[,1], 163,172))
blknonhisfemalepop <- data.frame(substr(aprsep1990df[,1], 173,182))
nativenonhismalepop <- data.frame(substr(aprsep1990df[,1], 183,192))
nativenonhisfemalepop <- data.frame(substr(aprsep1990df[,1], 193,202))
asiannonhismalepop <- data.frame(substr(aprsep1990df[,1], 203,212))
asiannonhisfemalepop <- data.frame(substr(aprsep1990df[,1], 213,222))
#put each population or time info into a dataframe
aprsep1990df <- cbind(charseries,month,year,age, totpop,totmalepop,totfemalepop,whtmalepop,whtfemalepop,blkmalepop,blkfemalepop,nativemalepop,nativefemalepop,asianmalepop,asianfemalepop,hismalepop,hisfemalepop,whtnonhismalepop,whtnonhisfemalepop,blknonhismalepop,blknonhisfemalepop,nativenonhismalepop,nativenonhisfemalepop,asiannonhismalepop,asiannonhisfemalepop)
#combine all the data frames into a single data frame
colnames(aprsep1990df) <- c("character_series","month","year","age","all_both_population","all_male_population","all_female_population","white_male_population","white_female_population","black_male_population","black_female_population","american_indian_eskimo_aleut_male_population","american_indian_eskimo_aleut_female_population","asian_pacific_islander_male_population","asian_pacific_islander_female_population","hispanic_male_population","hispanic_female_population","white_non-hispanic_male_population","white_non-hispanic_female_population","black_non-hispanic_male_population","black_non-hispanic_female_population", "american_indian_eskimo_aleut_non-hispanic_male_population","american_indian_eskimo_aleut_non-hispanic_female_population","asian_pacific_islander_non-hispanic_male_population","asian_pacific_islander_non-hispanic_female_population")
# change column names of all the columns in the data frame
write.csv(aprsep1990df, "aprsep1990.csv")
#original data frame
head(aprsep1990df)
## character_series month year age all_both_population all_male_population
## 1 9P 4 1990 999 248790925 121284188
## 2 9P 4 1990 0 3947307 2019079
## 3 9P 4 1990 1 3769554 1928747
## 4 9P 4 1990 2 3702679 1894973
## 5 9P 4 1990 3 3641649 1863343
## 6 9P 4 1990 4 3703886 1896723
## all_female_population white_male_population white_female_population
## 1 127506737 102163005 106577945
## 2 1928228 1603974 1523764
## 3 1840807 1535156 1458324
## 4 1807706 1514045 1437469
## 5 1778306 1496293 1420710
## 6 1807163 1526646 1446238
## black_male_population black_female_population
## 1 14439402 16077514
## 2 322783 315979
## 3 303886 297151
## 4 294040 286771
## 5 282653 276550
## 6 284805 278427
## american_indian_eskimo_aleut_male_population
## 1 1024828
## 2 24342
## 3 22574
## 4 22029
## 5 21547
## 6 21776
## american_indian_eskimo_aleut_female_population
## 1 1041719
## 2 23538
## 3 21857
## 4 21185
## 5 20759
## 6 21045
## asian_pacific_islander_male_population
## 1 3656953
## 2 67980
## 3 67131
## 4 64859
## 5 62850
## 6 63496
## asian_pacific_islander_female_population hispanic_male_population
## 1 3809559 11401617
## 2 64947 279052
## 3 63475 257016
## 4 62281 249443
## 5 60287 237887
## 6 61453 237275
## hispanic_female_population white_non-hispanic_male_population
## 1 10976922 91751437
## 2 268506 1351081
## 3 246165 1302268
## 4 238698 1287830
## 5 228037 1280148
## 6 227420 1310612
## white_non-hispanic_female_population black_non-hispanic_male_population
## 1 96563875 13824966
## 2 1280144 306538
## 3 1235384 288961
## 4 1220767 279484
## 5 1213685 269248
## 6 1239616 271876
## black_non-hispanic_female_population
## 1 15479132
## 2 300541
## 3 282733
## 4 273163
## 5 263684
## 6 265657
## american_indian_eskimo_aleut_non-hispanic_male_population
## 1 884914
## 2 20338
## 3 18941
## 4 18551
## 5 18208
## 6 18451
## american_indian_eskimo_aleut_non-hispanic_female_population
## 1 912114
## 2 19749
## 3 18335
## 4 17765
## 5 17429
## 6 17841
## asian_pacific_islander_non-hispanic_male_population
## 1 3421254
## 2 62070
## 3 61561
## 4 59665
## 5 57852
## 6 58509
## asian_pacific_islander_non-hispanic_female_population
## 1 3574694
## 2 59288
## 3 58190
## 4 57313
## 5 55471
## 6 56629
Data was scrapped. The information from each line was extracted from the data frame using the substr function and then reorganized and put together into a single data frame.
aprsep1990df <- aprsep1990df %>%
pivot_longer(
all_both_population:`asian_pacific_islander_non-hispanic_female_population`,
names_to = "race and gender",
values_to = "population",
values_drop_na = TRUE
)
#pivoted the race and gender columns into a single column.
aprsep1990df <- aprsep1990df %>%
mutate(sex = str_match(`race and gender`, "male|female|both"))
aprsep1990df <- aprsep1990df %>%
mutate(race = str_match(`race and gender`, "white_non-hispanic|black_non-hispanic|american_indian_eskimo_aleut_non-hispanic|asian_pacific_islander_non-hispanic|all|white|black|american_indian_eskimo_aleut|asian_pacific_islander|hispanic"))
#the gender was extracted and put into it's own column "sex"
tidy1990df <- aprsep1990df[ -c(1,5) ]
#removed the character series since it contained no relevant information. removed the race and gender column, since race and sex was put into their own columns.
tidy1990df <- tidy1990df[, c(1, 2, 3, 5, 6, 4)]
#reordered the columns.
write.csv(tidy1990df, "tidy1990.csv")
#tidy data
names(tidy1990df)
## [1] "month" "year" "age" "sex" "race"
## [6] "population"
tidy1990df[, 1] <- sapply(tidy1990df[, 1], as.integer)
tidy1990df[, 6] <- sapply(tidy1990df[, 6], as.integer)
#population and month were converted from characters into numbers.
head(tidy1990df)
## # A tibble: 6 × 6
## month year age sex[,1] race[,1] population
## <int> <chr> <chr> <chr> <chr> <int>
## 1 4 1990 999 both all 248790925
## 2 4 1990 999 male all 121284188
## 3 4 1990 999 female all 127506737
## 4 4 1990 999 male white 102163005
## 5 4 1990 999 female white 106577945
## 6 4 1990 999 male black 14439402
summary(tidy1990df)
## month year age sex.V1
## Min. :4.0 Length:12852 Length:12852 Length:12852
## 1st Qu.:5.0 Class :character Class :character Class :character
## Median :6.5 Mode :character Mode :character Mode :character
## Mean :6.5
## 3rd Qu.:8.0
## Max. :9.0
## race.V1 population
## Length:12852 Min. : 28
## Class :character 1st Qu.: 16117
## Mode :character Median : 73358
## Mean : 931318
## 3rd Qu.: 768077
## Max. :249964444
Initially, there were columns containing the sex, race, and gender of each combination and the population of each (e.g. white male population). Those column names were put into their own column and their populations were put in a column called population. The gender/sex and race were separated into two columns.
tidy1990df |> filter( age == "999" & sex != "both" & race == "all") |>
ggplot(aes(x=month, y=population, fill = sex, color= sex, group = sex))+
geom_point() +
labs(title = "All Populations")
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
all_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "all")
all_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "all")
male_model <- lm(population ~ month, data = all_male_pop)
female_model <- lm(population ~ month, data = all_female_pop)
summary(tidy1990df)
## month year age sex.V1
## Min. :4.0 Length:12852 Length:12852 Length:12852
## 1st Qu.:5.0 Class :character Class :character Class :character
## Median :6.5 Mode :character Mode :character Mode :character
## Mean :6.5
## 3rd Qu.:8.0
## Max. :9.0
## race.V1 population
## Length:12852 Min. : 28
## Class :character 1st Qu.: 16117
## Mode :character Median : 73358
## Mean : 931318
## 3rd Qu.: 768077
## Max. :249964444
summary(male_model)
##
## Call:
## lm(formula = population ~ month, data = all_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 6942.9 -4269.3 -4236.4 -2259.6 -408.7 4231.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120809069 8330 14502.03 < 2e-16 ***
## month 117044 1240 94.42 7.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5185 on 4 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9994
## F-statistic: 8916 on 1 and 4 DF, p-value: 7.542e-08
coef(male_model)
## (Intercept) month
## 120809068.6 117044.1
summary(female_model)
##
## Call:
## lm(formula = population ~ month, data = all_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 11988 -3163 -7793 -11358 -1194 11520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 127021566 17554 7235.86 2.19e-15 ***
## month 118296 2612 45.29 1.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10930 on 4 degrees of freedom
## Multiple R-squared: 0.9981, Adjusted R-squared: 0.9976
## F-statistic: 2051 on 1 and 4 DF, p-value: 1.422e-06
coef(female_model)
## (Intercept) month
## 127021565.5 118295.8
Interestingly, overall there are more females than males from April 1990 to September 1990. The slope of the female population is greater than the male’s. The overall number of females was larger and grew faster during this time period.
tidy1990df |> filter( age == "999" & sex == "male"& race != "all") |>
ggplot(aes(x=month, y=population, fill = race, color= race, group = race))+
geom_line() +
labs(title = "Male Populations")
white_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "white")
black_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "black")
hispanic_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "hispanic")
asian_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "asian_pacific_islander")
native_male_pop <- tidy1990df |> filter( age == "999" & sex == "male" & race == "american_indian_eskimo_aleut")
white_male_model <- lm(population ~ month, data = white_male_pop)
black_male_model <- lm(population ~ month, data = black_male_pop)
hispanic_male_model <- lm(population ~ month, data = hispanic_male_pop)
asian_male_model <- lm(population ~ month, data = asian_male_pop)
native_male_model <- lm(population ~ month, data = native_male_pop)
summary(white_male_model)
##
## Call:
## lm(formula = population ~ month, data = white_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 4117 -3030 -1820 -1136 -734 2603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.018e+08 4.955e+03 20552.8 < 2e-16 ***
## month 8.049e+04 7.373e+02 109.2 4.22e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3084 on 4 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9996
## F-statistic: 1.192e+04 on 1 and 4 DF, p-value: 4.221e-08
coef(white_male_model)
## (Intercept) month
## 101836918.46 80492.31
summary(black_male_model)
##
## Call:
## lm(formula = population ~ month, data = black_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 2215.8 -394.4 -1717.6 -2061.8 -224.9 2182.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14357658 3320 4324.72 1.72e-14 ***
## month 19882 494 40.25 2.28e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2067 on 4 degrees of freedom
## Multiple R-squared: 0.9975, Adjusted R-squared: 0.9969
## F-statistic: 1620 on 1 and 4 DF, p-value: 2.277e-06
coef(black_male_model)
## (Intercept) month
## 14357657.55 19882.17
summary(hispanic_male_model)
##
## Call:
## lm(formula = population ~ month, data = hispanic_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 1660.7 -832.2 -882.1 -1036.9 -254.8 1345.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.127e+07 2.152e+03 5236.2 7.98e-15 ***
## month 3.269e+04 3.202e+02 102.1 5.52e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1340 on 4 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9995
## F-statistic: 1.042e+04 on 1 and 4 DF, p-value: 5.523e-08
coef(hispanic_male_model)
## (Intercept) month
## 11269200.74 32688.89
summary(asian_male_model)
##
## Call:
## lm(formula = population ~ month, data = asian_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 474.0 -769.6 -630.3 1030.0 539.3 -643.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3595812.2 1386.7 2593.14 1.33e-13 ***
## month 15166.7 206.3 73.51 2.05e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 863.1 on 4 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9991
## F-statistic: 5403 on 1 and 4 DF, p-value: 2.053e-07
coef(asian_male_model)
## (Intercept) month
## 3595812.21 15166.69
summary(native_male_model)
##
## Call:
## lm(formula = population ~ month, data = native_male_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 135.76 -75.21 -68.18 -92.15 10.88 88.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.019e+06 1.709e+02 5961.63 4.75e-15 ***
## month 1.503e+03 2.543e+01 59.11 4.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 106.4 on 4 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9986
## F-statistic: 3494 on 1 and 4 DF, p-value: 4.904e-07
coef(native_male_model)
## (Intercept) month
## 1018680.352 1502.971
The intercept of these lines give us an estimate of the population size in the beginning of 1990. The slopes show the population growth per month. Looking at the males of each race: white male: slope: 80492, intercept: 101836918, black male: slope: 19882, intercept: 14357658, hispanic male: slope: 32689, intercept: 11269201, asian or pacific islander male: slope: 15167, intercept: 3595812, american indian or eskimo or aleut male: slope: 1503, intercept: 1018680
tidy1990df |> filter( age == "999" & sex == "female" & race != "all") |>
ggplot(aes(x=month, y=population, fill = race, color= race, group = race))+
geom_line() +
labs(title = "Female Populations")
white_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "white")
black_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "black")
hispanic_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "hispanic")
asian_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "asian_pacific_islander")
native_female_pop <- tidy1990df |> filter( age == "999" & sex == "female" & race == "american_indian_eskimo_aleut")
white_female_model <- lm(population ~ month, data = white_female_pop)
black_female_model <- lm(population ~ month, data = black_female_pop)
hispanic_female_model <- lm(population ~ month, data = hispanic_female_pop)
asian_female_model <- lm(population ~ month, data = asian_female_pop)
native_female_model <- lm(population ~ month, data = native_female_pop)
summary(white_female_model)
##
## Call:
## lm(formula = population ~ month, data = white_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 7834 -1942 -4591 -8109 -1411 8219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106257622 11955 8888.04 9.61e-16 ***
## month 78122 1779 43.92 1.61e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7442 on 4 degrees of freedom
## Multiple R-squared: 0.9979, Adjusted R-squared: 0.9974
## F-statistic: 1929 on 1 and 4 DF, p-value: 1.607e-06
coef(white_female_model)
## (Intercept) month
## 106257622.18 78122.23
summary(black_female_model)
##
## Call:
## lm(formula = population ~ month, data = black_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 3300.3 -355.2 -2333.7 -3869.3 -340.8 3598.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.599e+07 5.359e+03 2983.76 7.57e-14 ***
## month 2.136e+04 7.973e+02 26.79 1.15e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3336 on 4 degrees of freedom
## Multiple R-squared: 0.9945, Adjusted R-squared: 0.9931
## F-statistic: 717.7 on 1 and 4 DF, p-value: 1.154e-05
coef(black_female_model)
## (Intercept) month
## 15988771.66 21360.51
summary(hispanic_female_model)
##
## Call:
## lm(formula = population ~ month, data = hispanic_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 1868.2 -784.8 -924.7 -1534.6 -358.6 1734.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.084e+07 2.597e+03 4174.14 1.98e-14 ***
## month 3.363e+04 3.864e+02 87.04 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1617 on 4 degrees of freedom
## Multiple R-squared: 0.9995, Adjusted R-squared: 0.9993
## F-statistic: 7575 on 1 and 4 DF, p-value: 1.045e-07
coef(hispanic_female_model)
## (Intercept) month
## 10840518.04 33633.94
summary(asian_female_model)
##
## Call:
## lm(formula = population ~ month, data = asian_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 727.0 -776.8 -836.6 710.6 560.8 -385.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3739941 1344 2782.68 1.00e-13 ***
## month 17223 200 86.12 1.09e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 836.6 on 4 degrees of freedom
## Multiple R-squared: 0.9995, Adjusted R-squared: 0.9993
## F-statistic: 7417 on 1 and 4 DF, p-value: 1.09e-07
coef(asian_female_model)
## (Intercept) month
## 3739940.8 17222.8
summary(native_female_model)
##
## Call:
## lm(formula = population ~ month, data = native_female_pop)
##
## Residuals:
## 1 2 3 4 5 6
## 127.238 -88.990 -32.219 -90.448 -2.676 87.095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.035e+06 1.625e+02 6370.99 3.64e-15 ***
## month 1.590e+03 2.418e+01 65.77 3.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101.1 on 4 degrees of freedom
## Multiple R-squared: 0.9991, Adjusted R-squared: 0.9988
## F-statistic: 4326 on 1 and 4 DF, p-value: 3.201e-07
coef(native_female_model)
## (Intercept) month
## 1035230.848 1590.229
The intercept of these lines give us an estimate of the population size in the beginning of 1990. The slopes show the population growth per month. Looking at the females of each race:
white female: slope: 78122, intercept: 106257622, black female: slope: 21361, intercept: 15988772, hispanic female: slope: 33634, intercept: 10840518, asian or pacific islander female: slope: 17223, intercept: 3739941, american indian or eskimo or aleut female: slope: 1590, intercept: 1035231
White males were increasing faster than white females, white females had a higher overall population. Black females were greater in number and are increased faster than black males. There are more hispanic males than females, but hispanic females’ population grew slightly faster. Asian females were also more numerous and population was growing more than asian males. For native americans there were slightly more females and female population grew slightly faster.
tidy1990df |> filter( month == "9" & sex == "both" & race == "all" & age != "999") |>
ggplot(aes(x=age, y=population))+
geom_col()
population_age <- tidy1990df |> filter( month == "9" & sex == "both" & race == "all" & age != "999")
population_age$age[which.max(population_age$population)]
## [1] " 30"
Questions involving age may also be answered. The age with the greatest population is 30 for everyone in September 1990.
Data presented was read into a data frame and then made more tidy by pivoting the sex and race of the populations. With some filtering the data can be analyzed. The populations of males overall were less and growing less than females. The age with the greatest population is 30 for September 1990.
Project 2 simple data / section 2:
author: Keith
DeNivo, Adriana Medina
date: 2024-03-02
Untidy/wide data set was sourced by this PDF: “https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwjQ3b6x-cyEAxUWlYkEHbQwBPoQFnoECA0QAQ&url=https%3A%2F%2Fcommunity.esri.com%2Fccqpr47374%2Fattachments%2Fccqpr47374%2Farcgis-insights-questions%2F976%2F1%2FData%2520Reshaping%2520Doc%2520Formatted.pdf&usg=AOvVaw1rmDMqBCK-e4Nze3fx4Tgm&opi=89978449” It is a pdf that contains a lesson of what is long and what is wide data. The particular dataset is used is for select states and their population for each year from 2015 to 2020. CSV is posted by Natalie Kalukeerthie. Tidying done by Keith DeNivo. Analysis by Adriana Medina.
library(dplyr)
library(RCurl)
library(tidyr)
library(tidyverse)
Loaded relevant Libraries: tidyr and dplyr.
c <- getURL("https://raw.githubusercontent.com/nk014914/Data-607/main/Population_data.csv")
#read in the raw file
popdata_df <- data.frame(read.csv(text = c ))
#convert file to data frame popdata_df
head(popdata_df)
## ï..Name FIPS Abbreviations X2015.Population X2016.Population
## 1 Georgia 14 GA 10067378 10189016
## 2 North Carolina 37 NC 9932862 10036881
## 3 NA NA NA
## 4 South Carolina 45 SC 4817440 4879157
## 5 Virginia 51 VA 8254218 8312400
## X2017.Population X2018.Population X2019.Population X2020.Population
## 1 10325943 10471428 10604413 10722092
## 2 10151700 10291929 10455811 10598314
## 3 NA NA NA NA
## 4 4954035 5036155 5118397 5196026
## 5 8370206 8420184 8483598 8539322
csv was converted into a data frame.
popdata_df <- popdata_df[-c(3), ]
#remove blank row
head(popdata_df)
## ï..Name FIPS Abbreviations X2015.Population X2016.Population
## 1 Georgia 14 GA 10067378 10189016
## 2 North Carolina 37 NC 9932862 10036881
## 4 South Carolina 45 SC 4817440 4879157
## 5 Virginia 51 VA 8254218 8312400
## X2017.Population X2018.Population X2019.Population X2020.Population
## 1 10325943 10471428 10604413 10722092
## 2 10151700 10291929 10455811 10598314
## 4 4954035 5036155 5118397 5196026
## 5 8370206 8420184 8483598 8539322
names(popdata_df)
## [1] "ï..Name" "FIPS" "Abbreviations" "X2015.Population"
## [5] "X2016.Population" "X2017.Population" "X2018.Population" "X2019.Population"
## [9] "X2020.Population"
blank row containing no values was removed.
popdatatidy <- popdata_df |> pivot_longer(
X2015.Population:X2020.Population,
names_to = "year",
values_to = "population",
values_drop_na = TRUE)
#population for each year columns 2015 to 2020 were pivoted into a single column called "year". Their values was assigned to population.
Names of the columns containing the population for a given year were pivoted into a single column called “year”, so that the year could be extracted. Their values was assigned to population, so that all populations are in a single column.
popdatatidy_df <- popdatatidy %>%
mutate(year = str_match(year, "[0-9]+"))
head(popdatatidy_df)
## # A tibble: 6 × 5
## ï..Name FIPS Abbreviations year[,1] population
## <chr> <int> <chr> <chr> <dbl>
## 1 Georgia 14 GA 2015 10067378
## 2 Georgia 14 GA 2016 10189016
## 3 Georgia 14 GA 2017 10325943
## 4 Georgia 14 GA 2018 10471428
## 5 Georgia 14 GA 2019 10604413
## 6 Georgia 14 GA 2020 10722092
#extracted the digits for year in the column year
There were extra characters in the Name column : “ï..Name”
# Remove extra characters from column names
colnames(popdatatidy_df) <- gsub("ï..", "", colnames(popdatatidy_df))
print(colnames(popdatatidy_df))
## [1] "Name" "FIPS" "Abbreviations" "year"
## [5] "population"
head(popdatatidy_df)
## # A tibble: 6 × 5
## Name FIPS Abbreviations year[,1] population
## <chr> <int> <chr> <chr> <dbl>
## 1 Georgia 14 GA 2015 10067378
## 2 Georgia 14 GA 2016 10189016
## 3 Georgia 14 GA 2017 10325943
## 4 Georgia 14 GA 2018 10471428
## 5 Georgia 14 GA 2019 10604413
## 6 Georgia 14 GA 2020 10722092
Using Regex the year was extracted for the strings containing the year and “population” from column year.
popdatatidy_df <- popdatatidy_df[order(popdatatidy_df$year,popdatatidy_df$Name),]
head(popdatatidy_df)
## # A tibble: 6 × 5
## Name FIPS Abbreviations year[,1] population
## <chr> <int> <chr> <chr> <dbl>
## 1 Georgia 14 GA 2015 10067378
## 2 North Carolina 37 NC 2015 9932862
## 3 South Carolina 45 SC 2015 4817440
## 4 Virginia 51 VA 2015 8254218
## 5 Georgia 14 GA 2016 10189016
## 6 North Carolina 37 NC 2016 10036881
#decided to sort by ascending time and keeping states alphabetical order.
Data can be sorted as desired. Data was sorted by the year, then alphabetically by state.
colnames(popdatatidy_df)[1] <- "State"
#rename the name column containing the names of states to state
popdatatidy_df[, 4] <- sapply(popdatatidy_df[, 4], as.integer)
#converted the year to an integer from character.
head(popdatatidy_df)
## # A tibble: 6 × 5
## State FIPS Abbreviations year population
## <chr> <int> <chr> <int> <dbl>
## 1 Georgia 14 GA 2015 10067378
## 2 North Carolina 37 NC 2015 9932862
## 3 South Carolina 45 SC 2015 4817440
## 4 Virginia 51 VA 2015 8254218
## 5 Georgia 14 GA 2016 10189016
## 6 North Carolina 37 NC 2016 10036881
write.csv(popdatatidy_df, "census1990.csv")
Year was converted into an integer for potential numeric calculations. Name column was changed to state. csv of the tidy data was written
Our aim is to analyze population data for each state across a six-year period to uncover underlying patterns and trends. Furthermore, we seek to find the year that exhibits the largest population growth.
For the first part of the analysis I wanted to arrange the data by state and year, then group the data by state. For this I am using the group_by function from dplyr. This line of code computes the population growth (or decline) by subtracting the lagged population values from the current population values.
This will result in a N/A value for the year 2015 for each state because there is no prior data to that year.
#calc pop growth compared to the previous year
popdatatidy_df <- popdatatidy_df %>%
arrange(State, year) %>%
group_by(State) %>%
mutate(population_growth = population - lag(population))
ggplot(popdatatidy_df, aes(x = year, y = population_growth)) +
geom_bar(stat = "identity", fill = "pink", na.rm = TRUE) +
labs(title = "Population Growth Over 6 Years",
x = "Year",
y = "Population Growth") +
theme_minimal()
## Warning: Removed 4 rows containing missing values (`position_stack()`).
Based on the bar graph above, it is evident that 2019 is the year that has the largest population growth.
max_growth_year <- popdatatidy_df %>%
group_by(year) %>%
summarise(total_population_growth = sum(population_growth, na.rm = TRUE)) %>%
slice(which.max(total_population_growth)) %>%
pull(year)
print(max_growth_year)
## [1] 2019
This is a dataset that I’ve chosen from the Harvard Dataverse containing a list of awarded noble prizes and metadata about the winners.
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/HYRJDX
library(tidyr)
library(dplyr)
library(ggplot2)
raw_data_path<-("https://raw.githubusercontent.com/amedina613/Data-607-Project-2/main/complete.csv")
raw_data <- read.csv(raw_data_path, sep = ",")
print(colnames(raw_data))
## [1] "awardYear" "category"
## [3] "categoryFullName" "sortOrder"
## [5] "portion" "prizeAmount"
## [7] "prizeAmountAdjusted" "dateAwarded"
## [9] "prizeStatus" "motivation"
## [11] "categoryTopMotivation" "award_link"
## [13] "id" "name"
## [15] "knownName" "givenName"
## [17] "familyName" "fullName"
## [19] "penName" "gender"
## [21] "laureate_link" "birth_date"
## [23] "birth_city" "birth_cityNow"
## [25] "birth_continent" "birth_country"
## [27] "birth_countryNow" "birth_locationString"
## [29] "death_date" "death_city"
## [31] "death_cityNow" "death_continent"
## [33] "death_country" "death_countryNow"
## [35] "death_locationString" "orgName"
## [37] "nativeName" "acronym"
## [39] "org_founded_date" "org_founded_city"
## [41] "org_founded_cityNow" "org_founded_continent"
## [43] "org_founded_country" "org_founded_countryNow"
## [45] "org_founded_locationString" "ind_or_org"
## [47] "residence_1" "residence_2"
## [49] "affiliation_1" "affiliation_2"
## [51] "affiliation_3" "affiliation_4"
raw_data <- raw_data %>%
mutate(residence = coalesce(residence_1, residence_2)) %>%
select(-c(residence_1, residence_2))
raw_data <- raw_data %>%
mutate(affiliation = coalesce(affiliation_1, affiliation_2, affiliation_3, affiliation_4)) %>%
select(-c(affiliation_1, affiliation_2, affiliation_3, affiliation_4))
write.csv(raw_data, "tidynoble.csv")
This is quite a wide data set and these are just some examples of what you can do
awards <- na.omit(subset(raw_data, select = c("awardYear", "category", "prizeAmount")))
recipients <- na.omit(subset(raw_data, select = c("name", "birth_date", "gender")))
recipients_without_gender <- recipients %>%
filter(is.na(gender) | gender == "")
recipients <- recipients %>%
filter(!is.na(gender), gender != "")
I made a df for the recipients without gender to make sure everything checks out. It does, 950-27 is 923, which is the number of obs. in the filtered final df.
gender_counts <- table(recipients$gender)
proportion_women <- gender_counts["female"] / sum(gender_counts)
proportion_men <- gender_counts["male"] / sum(gender_counts)
gender_data <- data.frame(
Gender = c("Female", "Male"),
Proportion = c(proportion_women, proportion_men)
)
ggplot(gender_data, aes(x = Gender, y = Proportion, fill = Gender)) +
geom_bar(stat = "identity") +
labs(title = "Proportion of Awards by Gender",
x = "Gender",
y = "Proportion") +
scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
theme_minimal()
field_prize_sum <- awards %>%
group_by(category) %>%
summarise(total_prize_amount = sum(prizeAmount, na.rm = TRUE))
max_prize_field <- field_prize_sum %>%
filter(total_prize_amount == max(total_prize_amount))
min_prize_field <- field_prize_sum %>%
filter(total_prize_amount == min(total_prize_amount))
print("Field with the largest prize amount:")
## [1] "Field with the largest prize amount:"
print(max_prize_field)
## # A tibble: 1 × 2
## category total_prize_amount
## <chr> <int>
## 1 Physics 725890928
print("Field with the lowest prize amount:")
## [1] "Field with the lowest prize amount:"
print(min_prize_field)
## # A tibble: 1 × 2
## category total_prize_amount
## <chr> <int>
## 1 Literature 289282102
I attempted to make a bar plot to visualize the number of awards given out each year for each award type, however there was so much information in the plot, it was not decipherable. So I decided to simplify the plot by aggregating by decade.
The floor function was used for this.
awards_by_decade_type <- awards %>%
group_by(decade = floor(awardYear / 10) * 10, category) %>%
summarise(num_awards = n(), .groups = 'drop')
ggplot(awards_by_decade_type, aes(x = as.factor(decade), y = num_awards, fill = category)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Decade", y = "Number of Awards", fill = "Award Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))