# Download the midwest CSV (comma-separated, first row holds column names)
# from the Rdatasets GitHub repository into a data frame
midwest <- read.csv(
  file = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv",
  header = TRUE,
  sep = ","
)
#loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

# Number of rows recorded for each state
count(midwest, state)
##   state   n
## 1    IL 102
## 2    IN  92
## 3    MI  83
## 4    OH  88
## 5    WI  72
# Per-state totals: sum poptotal and area over each state's rows, then
# derive density as total population divided by total area
grouped_by_state <- summarise(
  group_by(midwest, state),
  totalPopulation = sum(poptotal),
  totalArea = sum(area),
  density = totalPopulation / totalArea
)
print(grouped_by_state)
## # A tibble: 5 × 4
##   state totalPopulation totalArea  density
##   <chr>           <int>     <dbl>    <dbl>
## 1 IL           11430602      3.30 3459625.
## 2 IN            5544159      2.13 2606563.
## 3 MI            9295297      3.36 2768930.
## 4 OH           10847115      2.42 4480428.
## 5 WI            4891769      3.29 1488670.
# Bar chart of total population per state.
# geom_col() takes the bar height directly from totalPopulation. The previous
# geom_count() sizes points by how many observations share a location; with
# one summary row per state that yields one same-sized point per state and an
# uninformative "n" legend.
ggplot(grouped_by_state, aes(x = state, y = totalPopulation)) +
  geom_col() +
  labs(
    title = "Total Population by State",
    x = "State",
    y = "Total Population"
  )

# Show the state summary ordered by ascending total population
grouped_by_state %>%
  arrange(totalPopulation)
## # A tibble: 5 × 4
##   state totalPopulation totalArea  density
##   <chr>           <int>     <dbl>    <dbl>
## 1 WI            4891769      3.29 1488670.
## 2 IN            5544159      2.13 2606563.
## 3 MI            9295297      3.36 2768930.
## 4 OH           10847115      2.42 4480428.
## 5 IL           11430602      3.30 3459625.
# Per-state race summary (rows are grouped by state, not by race):
# the median of each race's population column over the state's rows,
# plus the state-wide sums of popwhite and popblack
grouped_by_race <- summarise(
  group_by(midwest, state),
  MedianW = median(popwhite),
  MedianB = median(popblack),
  MedianA = median(popasian),
  MedianO = median(popother),
  totalW = sum(popwhite),
  totalB = sum(popblack)
)
# Grouped bar chart of the median population of each race, by state.
# The four median columns are reshaped to one row per state/race pair so a
# single layer can dodge the bars side by side and ggplot can draw a legend.
# Four separate constant-fill layers cannot dodge against each other: they
# are painted on top of one another and carry no legend for the colours.
grouped_by_race %>%
  pivot_longer(
    cols = c(MedianW, MedianB, MedianA, MedianO),
    names_to = "race",
    values_to = "medianPopulation"
  ) %>%
  mutate(
    # Fix the bar/legend order and give the legend readable labels
    race = factor(
      race,
      levels = c("MedianW", "MedianB", "MedianA", "MedianO"),
      labels = c("White", "Black", "Asian", "Other")
    )
  ) %>%
  ggplot(aes(x = state, y = medianPopulation, fill = race)) +
  geom_col(position = "dodge") +
  # Same colours the original layers used for each race
  scale_fill_manual(
    values = c(
      White = "lightblue",
      Black = "yellow",
      Asian = "green",
      Other = "orange"
    )
  ) +
  labs(title = "Median Populations of Different Races by State",
       x = "State",
       y = "Median Population",
       fill = "Race") +
  theme_minimal() +
  # NOTE(review): angle = 4 looks like a typo (45?) — confirm intended rotation
  theme(axis.text.x = element_text(angle = 4, hjust = 0.1))

# Total of poptotal over every row of the midwest data; used as the
# denominator for the per-state population shares computed further down
total_population_midwest <- sum(midwest[["poptotal"]])
print(total_population_midwest)
## [1] 42008942
#print("total population of midwest: ",total_population_midwest)

# State codes, printed so the unnamed share vectors below can be read by position
print(grouped_by_race$state)
## [1] "IL" "IN" "MI" "OH" "WI"
# Each state's totalW divided by the total midwest population
probW <- with(grouped_by_race, totalW / total_population_midwest)
print("white population of midwest below:")
## [1] "white population of midwest below:"
print(probW)
## [1] 0.2131208 0.1195150 0.1846294 0.2266602 0.1074182
# Each state's totalB divided by the total midwest population
probB <- with(grouped_by_race, totalB / total_population_midwest)
print("Black population of midwest below")
## [1] "Black population of midwest below"
print(probB)
## [1] 0.040331247 0.010285715 0.030748358 0.027490004 0.005821118