library(tidyverse)

1.Dataset - MTA Daily Ridership Data: Beginning 2020

In the MTA dataset, determine the average ridership in the different transportation such as the Subway, Buses, LIRR and Metro-North. This will help us determine which transportation most people most likely would take. Since the dataset started in 2020 when the pandemic happened I wanted to find out if there were more people taking the Subway or Buses in 2020 and what the highest and lowest ridership.

To prepare the dataset I had to change it from wide to long and separate out the date column to be able to extract data based on the year. It allows me to extract the different types of transportation and their ridership.

Load .csv file

mta <- read_csv("https://raw.githubusercontent.com/AnnaMoy/Data-607/main/MTA_Daily_Ridership_Data.csv")

mta
## # A tibble: 1,452 × 15
##    Date     Subways: Total Estim…¹ Subways: % of Compar…² Buses: Total Estimat…³
##    <chr>                     <dbl>                  <dbl>                  <dbl>
##  1 03/01/2…                2212965                   0.97                 984908
##  2 03/02/2…                5329915                   0.96                2209066
##  3 03/03/2…                5481103                   0.98                2228608
##  4 03/04/2…                5498809                   0.99                2177165
##  5 03/05/2…                5496453                   0.99                2244515
##  6 03/06/2…                5189447                   0.93                2066743
##  7 03/07/2…                2814637                   0.92                1249085
##  8 03/08/2…                2120656                   0.93                 957163
##  9 03/09/2…                4973513                   0.89                2124770
## 10 03/10/2…                4867818                   0.87                2111989
## # ℹ 1,442 more rows
## # ℹ abbreviated names: ¹​`Subways: Total Estimated Ridership`,
## #   ²​`Subways: % of Comparable Pre-Pandemic Day`,
## #   ³​`Buses: Total Estimated Ridership`
## # ℹ 11 more variables: `Buses: % of Comparable Pre-Pandemic Day` <dbl>,
## #   `LIRR: Total Estimated Ridership` <dbl>,
## #   `LIRR: % of Comparable Pre-Pandemic Day` <dbl>, …

Transform data

# Change from wide to long data 
mta <- mta %>%
  pivot_longer(-Date, names_to =c("transportation","percent_prepandemic"),
               names_sep = ":", ) %>%
  pivot_wider(names_from = percent_prepandemic, values_from = value)

#separate out the dates into their own columns
mta <- mta %>%
  separate(Date, into = c("month", "date", "year")) 

#rename the column names
colnames(mta) <- c("month", "date", "year", "transportation", "total_ridership", "percent_prepandemic", "total_trips", "total_traffic")

#change certain columns to integers
mta$month <- as.integer(mta$month)
mta$date <- as.integer(mta$date)
mta$year <- as.integer(mta$year)
mta$total_ridership <- as.integer(mta$total_ridership)
mta$total_traffic <- as.integer(mta$total_traffic)
mta$total_trips <- as.integer(mta$total_trips)

mta
## # A tibble: 10,164 × 8
##    month  date  year transportation        total_ridership percent_prepandemic
##    <int> <int> <int> <chr>                           <int>               <dbl>
##  1     3     1  2020 Subways                       2212965                0.97
##  2     3     1  2020 Buses                          984908                0.99
##  3     3     1  2020 LIRR                               NA               NA   
##  4     3     1  2020 Metro-North                     55826                0.59
##  5     3     1  2020 Access-A-Ride                      NA                1.13
##  6     3     1  2020 Bridges and Tunnels                NA                0.98
##  7     3     1  2020 Staten Island Railway            1636                0.52
##  8     3     2  2020 Subways                       5329915                0.96
##  9     3     2  2020 Buses                         2209066                0.99
## 10     3     2  2020 LIRR                           321569                1.03
## # ℹ 10,154 more rows
## # ℹ 2 more variables: total_trips <int>, total_traffic <int>

Analysis

Summarize the 2020,2021, 2022 average ridership for Subway, Buses, LIRR and Metro-North.

mta_summary <- mta %>%
  filter(transportation =="Subways" & year == 2020 |transportation =="Subways" & year == 2021 | transportation =="Subways" & year == 2022 |transportation =="Buses" & year == 2020 |transportation =="Buses" & year == 2021 | transportation =="Buses" & year == 2022 | transportation =="LIRR" & year == 2020 |transportation =="LIRR" & year == 2021 | transportation =="LIRR" & year == 2022 | transportation =="Metro-North" & year == 2020 |transportation =="Metro-North" & year == 2021 | transportation =="Metro-North" & year == 2022 ) %>%
  group_by(year, transportation) %>%
  summarize(mean_rideship = mean(total_ridership), na.rm = TRUE) %>%
  arrange(desc(mean_rideship)) 

mta_summary
## # A tibble: 12 × 4
## # Groups:   year [3]
##     year transportation mean_rideship na.rm
##    <int> <chr>                  <dbl> <lgl>
##  1  2022 Subways             2773989. TRUE 
##  2  2021 Subways             2081672. TRUE 
##  3  2020 Subways             1209467. TRUE 
##  4  2022 Buses               1161498. TRUE 
##  5  2021 Buses               1045583. TRUE 
##  6  2020 Buses                481659. TRUE 
##  7  2022 LIRR                 142462. TRUE 
##  8  2022 Metro-North          126138. TRUE 
##  9  2021 LIRR                  96630. TRUE 
## 10  2021 Metro-North           72498. TRUE 
## 11  2020 Metro-North           37545. TRUE 
## 12  2020 LIRR                     NA  TRUE

Determine if there were more people taking the bus or subway in 2020.

sub_or_bus <- mta %>%
  group_by(year, transportation) %>%
  filter(transportation == "Subways" & year == 2020 | transportation == "Buses" & year == 2020) %>%
  summarize(sum_rideship = sum(total_ridership)) 

sub_or_bus
## # A tibble: 2 × 3
## # Groups:   year [1]
##    year transportation sum_rideship
##   <int> <chr>                 <int>
## 1  2020 Buses             147387699
## 2  2020 Subways           370096769

Which transportation has the highest and lowest ridership in 2020

low_high <- mta %>%
  group_by(transportation) %>%
  summarize(max = max(total_ridership), min = min(total_ridership)) %>%
  arrange(desc(max))

low_high
## # A tibble: 7 × 3
##   transportation            max    min
##   <chr>                   <int>  <int>
## 1 Subways               5498809 198399
## 2 Buses                 2244515   5498
## 3 Metro-North            224426   3281
## 4 Access-A-Ride              NA     NA
## 5 Bridges and Tunnels        NA     NA
## 6 LIRR                       NA     NA
## 7 Staten Island Railway      NA     NA

Which transportation has the highest and lowest ridership in 2023

low_high_2023 <- mta %>%
  filter(year == 2023) %>%
  group_by(transportation) %>%
  summarize(max = max(total_ridership), min = min(total_ridership)) %>%
  arrange(desc(max))

low_high_2023
## # A tibble: 7 × 3
##   transportation            max     min
##   <chr>                   <int>   <int>
## 1 Subways               4179861 1273729
## 2 Buses                 1535264  381879
## 3 LIRR                   254968   64883
## 4 Metro-North            224426   61728
## 5 Staten Island Railway    8671     959
## 6 Access-A-Ride              NA      NA
## 7 Bridges and Tunnels        NA      NA

Conclusion

The largest amount of people riding the Subways was in 2022, 2021 and 2020. In 2020 there were more people taking the subway than buses and there were up to 5,498,809 people taking the subway and at it’s low point there was only 198,399 people taking the Subway. Overall the ridership in 2023 has not returned to how it was in 2020.

2. Dataset - Global Inflation Data

In the Global Inflation Data I had to determine which region had the highest and lowest CPI which will tells us how the market basket of consumer good and services changes over time. We can take a closer look at the United States country and see the change in the recent years on how the avg. CPI changes as well as view when the min and max CPI happened.

To prepare the dataset I had to create a csv to determine the region for each country as suggested by Mohammed Rahman.

Load .csv file

global <- read_csv("https://raw.githubusercontent.com/AnnaMoy/Data-607/main/global_inflation_data.csv")
region <- read_csv("https://raw.githubusercontent.com/AnnaMoy/Data-607/main/region.csv")

#combine both files together to add in region for analysis and selecting only the columns I need
colnames(region) <- c("country_name", "region")
global <- merge(global, region, by = "country_name")

Transform data

#change from wide data to long data and removing NA
global <- global %>%
  pivot_longer(`1980`:`2024`,names_to = "year", values_to = "avg_inflationCPI", values_drop_na = TRUE) %>%
  select(country_name, region, year, avg_inflationCPI)

global 
## # A tibble: 7,952 × 4
##    country_name region         year  avg_inflationCPI
##    <chr>        <chr>          <chr>            <dbl>
##  1 Afghanistan  Asia & Pacific 1980              13.4
##  2 Afghanistan  Asia & Pacific 1981              22.2
##  3 Afghanistan  Asia & Pacific 1982              18.2
##  4 Afghanistan  Asia & Pacific 1983              15.9
##  5 Afghanistan  Asia & Pacific 1984              20.4
##  6 Afghanistan  Asia & Pacific 1985               8.7
##  7 Afghanistan  Asia & Pacific 1986              -2.1
##  8 Afghanistan  Asia & Pacific 1987              18.4
##  9 Afghanistan  Asia & Pacific 1988              27.5
## 10 Afghanistan  Asia & Pacific 1989              71.5
## # ℹ 7,942 more rows
# Separate out data for region only  analysis
region_only <- global %>%
  select(region, year, avg_inflationCPI)

region_only
## # A tibble: 7,952 × 3
##    region         year  avg_inflationCPI
##    <chr>          <chr>            <dbl>
##  1 Asia & Pacific 1980              13.4
##  2 Asia & Pacific 1981              22.2
##  3 Asia & Pacific 1982              18.2
##  4 Asia & Pacific 1983              15.9
##  5 Asia & Pacific 1984              20.4
##  6 Asia & Pacific 1985               8.7
##  7 Asia & Pacific 1986              -2.1
##  8 Asia & Pacific 1987              18.4
##  9 Asia & Pacific 1988              27.5
## 10 Asia & Pacific 1989              71.5
## # ℹ 7,942 more rows
#

Analysis Compare the inflation rate of countries in the same region

# Find the min and max for all the regions and sort it
global_max <- global %>%
  group_by(region) %>%
  summarize(max_cpi = max(avg_inflationCPI)) %>%
  arrange(desc(max_cpi))

global_min <- global %>%
  group_by(region) %>%
  summarize(min_cpi = min(avg_inflationCPI)) %>%
  arrange(min_cpi)

global_max
## # A tibble: 9 × 2
##   region                max_cpi
##   <chr>                   <dbl>
## 1 South/Latin America   65374. 
## 2 Africa                23773. 
## 3 South/Central America 13110. 
## 4 Europe                 5273. 
## 5 Asia & Pacific         3102. 
## 6 Middle east             487. 
## 7 Arab States             359. 
## 8 <NA>                    111. 
## 9 North America            13.5
global_min
## # A tibble: 9 × 2
##   region                min_cpi
##   <chr>                   <dbl>
## 1 Africa                  -72.7
## 2 Asia & Pacific          -71.3
## 3 South/Latin America     -44.4
## 4 Europe                  -11.7
## 5 Arab States             -11.3
## 6 Middle east              -9.9
## 7 <NA>                     -2.4
## 8 North America            -0.3
## 9 South/Central America     3.5
# See how region overall looks like in their CPI
region_year <- region_only %>%
  group_by(region, year) %>%
  summarize(overall = mean(avg_inflationCPI))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
region_year
## # A tibble: 405 × 3
## # Groups:   region [9]
##    region year  overall
##    <chr>  <chr>   <dbl>
##  1 Africa 1980     19.3
##  2 Africa 1981     19.3
##  3 Africa 1982     16.1
##  4 Africa 1983     22.2
##  5 Africa 1984     16.8
##  6 Africa 1985     18.9
##  7 Africa 1986     18.2
##  8 Africa 1987     26.7
##  9 Africa 1988     19.7
## 10 Africa 1989     20.6
## # ℹ 395 more rows
ggplot(region_year, aes(year, overall)) +
      geom_bar(stat= "identity", position = "dodge") +
      coord_flip() +
      facet_wrap(~region)

# Filter for one region and look at the min and max and sort the avg_inflationCPI to see when it was the lowest CPI
us <- global %>%
  filter(country_name == "United States") %>%
  mutate(min = min(avg_inflationCPI), max = max(avg_inflationCPI)) %>%
  arrange(desc(year))

us
## # A tibble: 45 × 6
##    country_name  region        year  avg_inflationCPI   min   max
##    <chr>         <chr>         <chr>            <dbl> <dbl> <dbl>
##  1 United States North America 2024               2.8  -0.3  13.5
##  2 United States North America 2023               4.1  -0.3  13.5
##  3 United States North America 2022               8    -0.3  13.5
##  4 United States North America 2021               4.7  -0.3  13.5
##  5 United States North America 2020               1.3  -0.3  13.5
##  6 United States North America 2019               1.8  -0.3  13.5
##  7 United States North America 2018               2.4  -0.3  13.5
##  8 United States North America 2017               2.1  -0.3  13.5
##  9 United States North America 2016               1.3  -0.3  13.5
## 10 United States North America 2015               0.1  -0.3  13.5
## # ℹ 35 more rows
ggplot(us, aes(year, avg_inflationCPI)) +
  geom_bar(stat = "identity", position ="dodge" ) +
  coord_flip()

Conclusion

The highest CPI across all the different regions is South/Latin America with 65374 and the lowest CPI is Africa at -72.7. The data has extreme outliers for some of the locations as you can see from the difference in CPI number. The CPI shows there is less changes in CPI for North America. When viewing the United States CPI change over time you can see the max was from 1980 and the min was from 2009. The CPI use to be really high and it decreased and has a spike back in 2022. Based on the different region South/Central America had the most changes in CPI and spikes.

3. Dataset - Laptop Prices

In the Laptop prices dataset is to find a summary statistics on the min, max, median of the prices of the different types of laptop. And finding out which laptop has the largest screen and weights the most. It would be helpful to know which Company produces the highest and lowest rams for their computers.

To be able to analyze the data I had to clean up the data and remove some of the numbers with the characters to be able to find out the min and max.

Load .csv file

laptop <- read_csv("https://raw.githubusercontent.com/AnnaMoy/Data-607/main/laptop.csv")

laptop
## # A tibble: 1,303 × 12
##    `Unnamed: 0` Company TypeName  Inches ScreenResolution     Cpu   Ram   Memory
##           <dbl> <chr>   <chr>     <chr>  <chr>                <chr> <chr> <chr> 
##  1            0 Apple   Ultrabook 13.3   IPS Panel Retina Di… Inte… 8GB   128GB…
##  2            1 Apple   Ultrabook 13.3   1440x900             Inte… 8GB   128GB…
##  3            2 HP      Notebook  15.6   Full HD 1920x1080    Inte… 8GB   256GB…
##  4            3 Apple   Ultrabook 15.4   IPS Panel Retina Di… Inte… 16GB  512GB…
##  5            4 Apple   Ultrabook 13.3   IPS Panel Retina Di… Inte… 8GB   256GB…
##  6            5 Acer    Notebook  15.6   1366x768             AMD … 4GB   500GB…
##  7            6 Apple   Ultrabook 15.4   IPS Panel Retina Di… Inte… 16GB  256GB…
##  8            7 Apple   Ultrabook 13.3   1440x900             Inte… 8GB   256GB…
##  9            8 Asus    Ultrabook 14     Full HD 1920x1080    Inte… 16GB  512GB…
## 10            9 Acer    Ultrabook 14     IPS Panel Full HD 1… Inte… 8GB   256GB…
## # ℹ 1,293 more rows
## # ℹ 4 more variables: Gpu <chr>, OpSys <chr>, Weight <chr>, Price <dbl>

Transform data

#Remove the blank rows
laptop <- laptop %>%
  drop_na()

# Separate out the number and character from Ram, Weight, Memory
ram <- str_sub(laptop$Ram, 1,4)
ram2 <- as.integer(str_extract(ram, "[0-9]+"))
gb <- (str_extract(ram, "[aA-zZ]+"))
weight <-str_sub(laptop$Weight, 1,6)
kg <- (str_extract(weight, "[aA-zZ]+"))
wght <- as.numeric(str_extract(weight, "[0-9]+"))

#Separate out the Memory column into 2 columns then separate out the number and characters
laptop[c("mem", "gb")] <- str_split_fixed(laptop$Memory, " + ", 2)
mem1 <- str_sub(laptop$mem, 1,5)
mem2 <- as.integer(str_extract(mem1, "[0-9]+"))
gb2 <-(str_extract(mem1,"[aA-zZ]+" ))

laptop_data <- data.frame(laptop,ram2, gb, weight, kg, wght, mem2, gb2)
glimpse(laptop_data)
## Rows: 1,273
## Columns: 21
## $ Unnamed..0       <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ Company          <chr> "Apple", "Apple", "HP", "Apple", "Apple", "Acer", "Ap…
## $ TypeName         <chr> "Ultrabook", "Ultrabook", "Notebook", "Ultrabook", "U…
## $ Inches           <chr> "13.3", "13.3", "15.6", "15.4", "13.3", "15.6", "15.4…
## $ ScreenResolution <chr> "IPS Panel Retina Display 2560x1600", "1440x900", "Fu…
## $ Cpu              <chr> "Intel Core i5 2.3GHz", "Intel Core i5 1.8GHz", "Inte…
## $ Ram              <chr> "8GB", "8GB", "8GB", "16GB", "8GB", "4GB", "16GB", "8…
## $ Memory           <chr> "128GB SSD", "128GB Flash Storage", "256GB SSD", "512…
## $ Gpu              <chr> "Intel Iris Plus Graphics 640", "Intel HD Graphics 60…
## $ OpSys            <chr> "macOS", "macOS", "No OS", "macOS", "macOS", "Windows…
## $ Weight           <chr> "1.37kg", "1.34kg", "1.86kg", "1.83kg", "1.37kg", "2.…
## $ Price            <dbl> 71378.68, 47895.52, 30636.00, 135195.34, 96095.81, 21…
## $ mem              <chr> "128GB SSD", "128GB Flash Storage", "256GB SSD", "512…
## $ gb               <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "…
## $ ram2             <int> 8, 8, 8, 16, 8, 4, 16, 8, 16, 8, 4, 4, 16, 4, 8, 8, 8…
## $ gb.1             <chr> "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB",…
## $ weight           <chr> "1.37kg", "1.34kg", "1.86kg", "1.83kg", "1.37kg", "2.…
## $ kg               <chr> "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg", "kg",…
## $ wght             <dbl> 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 2, 1,…
## $ mem2             <int> 128, 128, 256, 512, 256, 500, 256, 256, 512, 256, 500…
## $ gb2              <chr> "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB",…

Analysis

Determine the summary statistics on price of the laptops

laptop_price <- laptop_data %>%
  group_by(TypeName) %>%
  summarize(mean = mean(Price), median = median(Price), min = min(Price), max = max(Price), na.rm = T) %>%
  arrange(mean)

laptop_price
## # A tibble: 6 × 6
##   TypeName              mean  median    min     max na.rm
##   <chr>                <dbl>   <dbl>  <dbl>   <dbl> <lgl>
## 1 Netbook             34885.  18515.  9271. 101658. TRUE 
## 2 Notebook            41545.  36896. 10443. 261019. TRUE 
## 3 2 in 1 Convertible  68187.  63883. 14652  137995. TRUE 
## 4 Ultrabook           82926.  79867. 26587. 165168  TRUE 
## 5 Gaming              92204.  79215. 37243. 324955. TRUE 
## 6 Workstation        121498. 110018. 72940. 233846. TRUE

Determine which laptop weights the most

laptop_size <- laptop_data %>%
  group_by(TypeName) %>%
  summarize(weight = max(wght), size = max(Inches), avg = mean(Price)) %>%
  arrange(desc(weight))

laptop_size
## # A tibble: 6 × 4
##   TypeName           weight size      avg
##   <chr>               <dbl> <chr>   <dbl>
## 1 Notebook                8 35.6   41545.
## 2 2 in 1 Convertible      4 31.6   68187.
## 3 Gaming                  4 27.3   92204.
## 4 Workstation             3 17.3  121498.
## 5 Netbook                 2 12.5   34885.
## 6 Ultrabook              NA 33.5   82926.

Which laptop has the most Ram for Apple

laptop_ram <- laptop_data %>%
  group_by(Company) %>%
  summarize(ram = max(ram2), ram_min = min(ram2)) %>%
  arrange(desc(ram))

laptop_ram
## # A tibble: 19 × 3
##    Company     ram ram_min
##    <chr>     <int>   <int>
##  1 Asus         64       2
##  2 Dell         64       2
##  3 Lenovo       64       2
##  4 MSI          32       8
##  5 Razer        32       8
##  6 Toshiba      32       4
##  7 Acer         16       1
##  8 Apple        16       4
##  9 Google       16       8
## 10 HP           16       2
## 11 LG           16       8
## 12 Microsoft    16       4
## 13 Samsung      16       4
## 14 Xiaomi       16       8
## 15 Fujitsu       8       4
## 16 Huawei        8       8
## 17 Chuwi         6       4
## 18 Mediacom      4       2
## 19 Vero          4       2

Conclusion

It can be concluded that for the cheapest laptop type is the Netbook and the most expensive is the Workstation. The average cost of the different types of laptop ranges from 34,885 to 121,498. The largest and heaviest laptop is the Notebook with an average price of 41,545. The Company that produces the largest Ram is Lenovo, Dell and Asus at 64 GB.