# Download the ggplot2 `midwest` dataset (CSV mirror hosted by Rdatasets).
# The file has a header row and is comma-separated, which are the
# read.csv() defaults, so only the location needs to be given.
midwest <- read.csv(
  "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv"
)
#loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Number of rows in `midwest` for each value of `state`
count(midwest, state)
## state n
## 1 IL 102
## 2 IN 92
## 3 MI 83
## 4 OH 88
## 5 WI 72
# Per-state totals: one output row per `state` with the summed population,
# the summed area, and their ratio (population per unit of area).
grouped_by_state <- summarise(
  group_by(midwest, state),
  totalPopulation = sum(poptotal),
  totalArea = sum(area),
  # Uses the two totals defined just above, not the raw columns.
  density = totalPopulation / totalArea
)
grouped_by_state
## # A tibble: 5 × 4
## state totalPopulation totalArea density
## <chr> <int> <dbl> <dbl>
## 1 IL 11430602 3.30 3459625.
## 2 IN 5544159 2.13 2606563.
## 3 MI 9295297 3.36 2768930.
## 4 OH 10847115 2.42 4480428.
## 5 WI 4891769 3.29 1488670.
# Bar chart of total population per state.
# `grouped_by_state` holds exactly one row per state, so geom_count() (which
# sizes points by how many observations share a location) would only draw
# five identically sized dots. geom_col() uses `y` as the bar height, which
# makes the totals directly comparable.
ggplot(grouped_by_state, aes(x = state, y = totalPopulation)) +
  geom_col()

# Show the per-state summary ordered by total population, smallest first
grouped_by_state %>%
  arrange(totalPopulation)
## # A tibble: 5 × 4
## state totalPopulation totalArea density
## <chr> <int> <dbl> <dbl>
## 1 WI 4891769 3.29 1488670.
## 2 IN 5544159 2.13 2606563.
## 3 MI 9295297 3.36 2768930.
## 4 OH 10847115 2.42 4480428.
## 5 IL 11430602 3.30 3459625.
# Per-state race summary (grouping is by `state`, one output row per state):
#   MedianW / MedianB / MedianA / MedianO - median of the white, black, asian
#     and other population columns across that state's rows
#   totalW / totalB - summed white and black population for the state
grouped_by_race <- midwest %>%
  group_by(state) %>%
  summarise(
    MedianW = median(popwhite),
    MedianB = median(popblack),
    MedianA = median(popasian),
    MedianO = median(popother),
    totalW = sum(popwhite),
    totalB = sum(popblack)
  )
# Grouped bar chart of the per-state median population of each race.
# position = "dodge" only separates bars within a single layer, so drawing
# one geom_bar() per column puts all four bars on the same x position (later
# layers cover earlier ones) and yields no legend. Reshaping the four median
# columns to long form lets one geom_col() place the bars side by side and
# map the fill colour to race.
grouped_by_race %>%
  pivot_longer(
    cols = c(MedianW, MedianB, MedianA, MedianO),
    names_to = "race",
    values_to = "median_population"
  ) %>%
  mutate(
    # Fix the level order so bars and legend read White, Black, Asian, Other
    # instead of the alphabetical order of the column names.
    race = factor(
      race,
      levels = c("MedianW", "MedianB", "MedianA", "MedianO"),
      labels = c("White", "Black", "Asian", "Other")
    )
  ) %>%
  ggplot(aes(x = state, y = median_population, fill = race)) +
  geom_col(position = "dodge") +
  # Same colours the original assigned to each race's layer.
  scale_fill_manual(
    name = "Race",
    values = c(
      White = "lightblue",
      Black = "yellow",
      Asian = "green",
      Other = "orange"
    )
  ) +
  labs(
    title = "Median Populations of Different Races by State",
    x = "State",
    y = "Median Population"
  ) +
  theme_minimal() +
  # NOTE(review): angle = 4 / hjust = 0.1 are kept as written; they look like
  # they may have been meant as 45 / 1 - confirm before changing.
  theme(axis.text.x = element_text(angle = 4, hjust = 0.1))

# Population shares across the Midwest ----
# Denominator for the shares: `poptotal` summed over every row of `midwest`
total_population_midwest <- sum(midwest[["poptotal"]])
total_population_midwest
## [1] 42008942
# State labels, in the same row order as the per-state shares computed below
grouped_by_race[["state"]]
## [1] "IL" "IN" "MI" "OH" "WI"
# Each state's white population as a fraction of the whole Midwest population
# (one value per state, in the row order of `grouped_by_race`)
probW <- grouped_by_race[["totalW"]] / total_population_midwest
print("white population of midwest below:")
## [1] "white population of midwest below:"
probW
## [1] 0.2131208 0.1195150 0.1846294 0.2266602 0.1074182
# Each state's Black population as a fraction of the whole Midwest population
# (one value per state, in the row order of `grouped_by_race`)
probB <- grouped_by_race[["totalB"]] / total_population_midwest
print("Black population of midwest below")
## [1] "Black population of midwest below"
probB
## [1] 0.040331247 0.010285715 0.030748358 0.027490004 0.005821118