DATA 607 – Project 2

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.2
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Untidy Data #1: This dataset is considered untidy because the category column contains multiple purchased items in each row, and the amount column includes multiple values within the same row.

To tidy this data, I need to transform it from a wide format to a long format, ensuring that each item and its corresponding price occupy a separate row.

untidy_invoices <- read.csv("8.-Invoices-with-Merged-Categories-and-Merged-Amounts.csv")

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on
## '8.-Invoices-with-Merged-Categories-and-Merged-Amounts.csv'

head(untidy_invoices)

##         Order.ID                                   Category
## 1 CA-2011-167199 Binders | Art | Phones | Fasteners | Paper
## 2 CA-2011-149020                Office Supplies | Furniture
## 3 CA-2011-131905  Office Supplies | Technology | Technology
## 4 CA-2011-127614             Accessories | Tables | Binders
##                                    Amount
## 1 609.98 | 5.48 | 391.98 | 755.96 | 31.12
## 2                            2.98 | 51.94
## 3                  7.2 | 42.0186 | 42.035
## 4                234.45 | 1256.22 | 17.46

# Separate the categories and amounts into multiple rows
tidy_invoices <- untidy_invoices %>%
  separate_rows(Category, sep = "\\|") %>%  # Split category by |
  separate_rows(Amount, sep = "\\|")  %>%   # Split amount by |
   mutate(Amount = as.numeric(Amount),      # Convert amount to numeric
    Amount = round(Amount, 2))  %>%    # Round amount to 2 decimal places
    rename(Price = Amount)                  # Rename Amount to Price
tidy_invoices

## # A tibble: 47 × 3
##    Order.ID       Category    Price
##    <chr>          <chr>       <dbl>
##  1 CA-2011-167199 "Binders " 610.  
##  2 CA-2011-167199 "Binders "   5.48
##  3 CA-2011-167199 "Binders " 392.  
##  4 CA-2011-167199 "Binders " 756.  
##  5 CA-2011-167199 "Binders "  31.1 
##  6 CA-2011-167199 " Art "    610.  
##  7 CA-2011-167199 " Art "      5.48
##  8 CA-2011-167199 " Art "    392.  
##  9 CA-2011-167199 " Art "    756.  
## 10 CA-2011-167199 " Art "     31.1 
## # ℹ 37 more rows

Now I will answer the following question: “What are the top categories of spending in the store, and how much is spent in each category?”

From the spending summary table, it’s clear that the top three categories in the store are binders, art supplies, and fasteners, with the following expenditures:

Binders: $3,302.65 Art Supplies: $1,794.52 Fasteners: $1,794.52

# Clean up the Categories
tidy_invoices <- tidy_invoices %>%
  mutate(Category = str_trim(Category),        # Trim whitespace
         Category = str_to_lower(Category))   # Convert to lower case

# Aggregate spending by category
spending_summary <- tidy_invoices %>%
  group_by(Category) %>%
  summarise(Total_Spent = sum(Price, na.rm = TRUE)) %>%
  arrange(desc(Total_Spent))  # Sort by spending

# View the summary
print(spending_summary)

## # A tibble: 10 × 2
##    Category        Total_Spent
##    <chr>                 <dbl>
##  1 binders              3303. 
##  2 art                  1795. 
##  3 fasteners            1795. 
##  4 paper                1795. 
##  5 phones               1795. 
##  6 accessories          1508. 
##  7 tables               1508. 
##  8 technology            182. 
##  9 office supplies       146. 
## 10 furniture              54.9

# Create a bar plot of total spending by category
ggplot(spending_summary, aes(x = reorder(Category, Total_Spent), y = Total_Spent)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better readability
  labs(title = "Total Spending by Category",
       x = "Categories",
       y = "Total Amount Spent") +
  theme_minimal()

# Create a pie chart
ggplot(spending_summary, aes(x = "", y = Total_Spent, fill = Category)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y") +
  labs(title = "Spending Distribution by Category") +
  theme_void()

Unitdy Data #2: This data talks about marriage rates for women of different races, education level, income, and employment rates. This data is very wide and untidy with multiple observations in a row.

untidy_women <- read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/marriage/women.csv")

## New names:
## Rows: 17 Columns: 99
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (98): ...1, year, all_2534, HS_2534, SC_2534, BAp_2534, BAo_2534, GD_25... date
## (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

untidy_women

## # A tibble: 17 × 99
##     ...1  year date       all_2534 HS_2534 SC_2534 BAp_2534 BAo_2534 GD_2534
##    <dbl> <dbl> <date>        <dbl>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>
##  1     1  1960 1960-01-01   0.0859  0.0750   0.115    0.288    0.288  NA    
##  2     2  1970 1970-01-01   0.100   0.0831   0.125    0.270    0.270  NA    
##  3     3  1980 1980-01-01   0.162   0.122    0.189    0.295    0.295  NA    
##  4     4  1990 1990-01-01   0.244   0.211    0.230    0.332    0.329   0.341
##  5     5  2000 2000-01-01   0.298   0.274    0.277    0.351    0.350   0.352
##  6     6  2001 2001-01-01   0.307   0.289    0.288    0.347    0.349   0.341
##  7     7  2002 2002-01-01   0.311   0.294    0.297    0.345    0.350   0.330
##  8     8  2003 2003-01-01   0.317   0.305    0.294    0.353    0.357   0.344
##  9     9  2004 2004-01-01   0.324   0.321    0.302    0.350    0.355   0.338
## 10    10  2005 2005-01-01   0.336   0.335    0.315    0.357    0.362   0.342
## 11    11  2006 2006-01-01   0.362   0.364    0.344    0.378    0.388   0.355
## 12    12  2007 2007-01-01   0.375   0.377    0.360    0.387    0.397   0.361
## 13    13  2008 2008-01-01   0.386   0.391    0.373    0.394    0.403   0.373
## 14    14  2009 2009-01-01   0.409   0.413    0.397    0.416    0.430   0.382
## 15    15  2010 2010-01-01   0.415   0.422    0.405    0.420    0.432   0.392
## 16    16  2011 2011-01-01   0.430   0.441    0.419    0.430    0.447   0.393
## 17    17  2012 2012-01-01   0.441   0.455    0.431    0.438    0.456   0.399
## # ℹ 90 more variables: White_2534 <dbl>, Black_2534 <dbl>, Hisp_2534 <dbl>,
## #   NE_2534 <dbl>, MA_2534 <dbl>, Midwest_2534 <dbl>, South_2534 <dbl>,
## #   Mountain_2534 <dbl>, Pacific_2534 <dbl>, poor_2534 <dbl>, mid_2534 <dbl>,
## #   rich_2534 <dbl>, all_3544 <dbl>, HS_3544 <dbl>, SC_3544 <dbl>,
## #   BAp_3544 <dbl>, BAo_3544 <dbl>, GD_3544 <dbl>, White_3544 <dbl>,
## #   Black_3544 <dbl>, Hisp_3544 <dbl>, NE_3544 <dbl>, MA_3544 <dbl>,
## #   Midwest_3544 <dbl>, South_3544 <dbl>, Mountain_3544 <dbl>, …

untidy_women <- untidy_women %>%
  select(-1) #remove first column 
  
untidy_women

## # A tibble: 17 × 98
##     year date       all_2534 HS_2534 SC_2534 BAp_2534 BAo_2534 GD_2534
##    <dbl> <date>        <dbl>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>
##  1  1960 1960-01-01   0.0859  0.0750   0.115    0.288    0.288  NA    
##  2  1970 1970-01-01   0.100   0.0831   0.125    0.270    0.270  NA    
##  3  1980 1980-01-01   0.162   0.122    0.189    0.295    0.295  NA    
##  4  1990 1990-01-01   0.244   0.211    0.230    0.332    0.329   0.341
##  5  2000 2000-01-01   0.298   0.274    0.277    0.351    0.350   0.352
##  6  2001 2001-01-01   0.307   0.289    0.288    0.347    0.349   0.341
##  7  2002 2002-01-01   0.311   0.294    0.297    0.345    0.350   0.330
##  8  2003 2003-01-01   0.317   0.305    0.294    0.353    0.357   0.344
##  9  2004 2004-01-01   0.324   0.321    0.302    0.350    0.355   0.338
## 10  2005 2005-01-01   0.336   0.335    0.315    0.357    0.362   0.342
## 11  2006 2006-01-01   0.362   0.364    0.344    0.378    0.388   0.355
## 12  2007 2007-01-01   0.375   0.377    0.360    0.387    0.397   0.361
## 13  2008 2008-01-01   0.386   0.391    0.373    0.394    0.403   0.373
## 14  2009 2009-01-01   0.409   0.413    0.397    0.416    0.430   0.382
## 15  2010 2010-01-01   0.415   0.422    0.405    0.420    0.432   0.392
## 16  2011 2011-01-01   0.430   0.441    0.419    0.430    0.447   0.393
## 17  2012 2012-01-01   0.441   0.455    0.431    0.438    0.456   0.399
## # ℹ 90 more variables: White_2534 <dbl>, Black_2534 <dbl>, Hisp_2534 <dbl>,
## #   NE_2534 <dbl>, MA_2534 <dbl>, Midwest_2534 <dbl>, South_2534 <dbl>,
## #   Mountain_2534 <dbl>, Pacific_2534 <dbl>, poor_2534 <dbl>, mid_2534 <dbl>,
## #   rich_2534 <dbl>, all_3544 <dbl>, HS_3544 <dbl>, SC_3544 <dbl>,
## #   BAp_3544 <dbl>, BAo_3544 <dbl>, GD_3544 <dbl>, White_3544 <dbl>,
## #   Black_3544 <dbl>, Hisp_3544 <dbl>, NE_3544 <dbl>, MA_3544 <dbl>,
## #   Midwest_3544 <dbl>, South_3544 <dbl>, Mountain_3544 <dbl>, …

Analysis Question: How does the level of education influence marriage rates among women? Specifically, are women with higher education levels more likely to marry compared to those with lower education levels?

After tidying the data, I find that the facet bar chart is the most effective visualization for my analysis. It enables me to explore marriage rates in relation to both year and education level. Interestingly, I’ve observed that education does not appear to significantly influence marriage rates as I initially expected. While there are slight variations among different groups, the differences are minimal.

smaller_dataset <- untidy_women %>%
    select(year, HS_2534, SC_2534, BAp_2534, BAo_2534, GD_2534)
smaller_dataset

## # A tibble: 17 × 6
##     year HS_2534 SC_2534 BAp_2534 BAo_2534 GD_2534
##    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>   <dbl>
##  1  1960  0.0750   0.115    0.288    0.288  NA    
##  2  1970  0.0831   0.125    0.270    0.270  NA    
##  3  1980  0.122    0.189    0.295    0.295  NA    
##  4  1990  0.211    0.230    0.332    0.329   0.341
##  5  2000  0.274    0.277    0.351    0.350   0.352
##  6  2001  0.289    0.288    0.347    0.349   0.341
##  7  2002  0.294    0.297    0.345    0.350   0.330
##  8  2003  0.305    0.294    0.353    0.357   0.344
##  9  2004  0.321    0.302    0.350    0.355   0.338
## 10  2005  0.335    0.315    0.357    0.362   0.342
## 11  2006  0.364    0.344    0.378    0.388   0.355
## 12  2007  0.377    0.360    0.387    0.397   0.361
## 13  2008  0.391    0.373    0.394    0.403   0.373
## 14  2009  0.413    0.397    0.416    0.430   0.382
## 15  2010  0.422    0.405    0.420    0.432   0.392
## 16  2011  0.441    0.419    0.430    0.447   0.393
## 17  2012  0.455    0.431    0.438    0.456   0.399

renamed_dataset <- smaller_dataset %>% 
rename(
        "High School or Less" = HS_2534,
        "Some College" = SC_2534,
        "Bachelor or More" = BAp_2534,
        "Bachelor No Graduate" = BAo_2534,
        "Graduate Degree" = GD_2534
    )
tidy_dataset <- renamed_dataset %>%
    pivot_longer(
        cols = c(`High School or Less`, `Some College`, `Bachelor or More`, `Bachelor No Graduate`, `Graduate Degree`),
        names_to = "Education Level",
        values_to = "Count"
    )
tidy_dataset

## # A tibble: 85 × 3
##     year `Education Level`      Count
##    <dbl> <chr>                  <dbl>
##  1  1960 High School or Less   0.0750
##  2  1960 Some College          0.115 
##  3  1960 Bachelor or More      0.288 
##  4  1960 Bachelor No Graduate  0.288 
##  5  1960 Graduate Degree      NA     
##  6  1970 High School or Less   0.0831
##  7  1970 Some College          0.125 
##  8  1970 Bachelor or More      0.270 
##  9  1970 Bachelor No Graduate  0.270 
## 10  1970 Graduate Degree      NA     
## # ℹ 75 more rows

sorted_tidy_dataset <- tidy_dataset %>%
    arrange(desc(Count))
sorted_tidy_dataset

## # A tibble: 85 × 3
##     year `Education Level`    Count
##    <dbl> <chr>                <dbl>
##  1  2012 Bachelor No Graduate 0.456
##  2  2012 High School or Less  0.455
##  3  2011 Bachelor No Graduate 0.447
##  4  2011 High School or Less  0.441
##  5  2012 Bachelor or More     0.438
##  6  2010 Bachelor No Graduate 0.432
##  7  2012 Some College         0.431
##  8  2009 Bachelor No Graduate 0.430
##  9  2011 Bachelor or More     0.430
## 10  2010 High School or Less  0.422
## # ℹ 75 more rows

#A bar chart to visualize marriage by education level 
ggplot(tidy_dataset, aes(x = `Education Level`, y = Count, fill = `Education Level`)) +
    geom_bar(stat = "identity") +
    labs(title = "Counts by Education Level", x = "Education Level", y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).

#A facet bar chart to visualize marriage by year and education level
ggplot(tidy_dataset, aes(x = `Education Level`, y = Count, fill = `Education Level`)) +
    geom_bar(stat = "identity") +
    facet_wrap(~ year) +
    labs(title = "Counts by Year", x = "Education Level", y = "Count") +
    theme_minimal() +
    theme(
        axis.text.x = element_blank(),  # Remove x-axis text
        axis.ticks.x = element_blank()   # Remove x-axis ticks
    )

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).

#A heatmap to visualize marriages in a matrix format, a heatmap can show variations in counts across education levels and years.
ggplot(tidy_dataset, aes(x = `Education Level`, y = year, fill = Count)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = "Heatmap of Counts by Education Level and Year", x = "Education Level", y = "Year") +
    theme_minimal()

Untidy Data #3

This data set has a few issues making it untidy. 1. Some column headers are values, not variable names. 2. Variables are stored in both rows and cloumns. 3. Multiple variables are store in on column. 4. Multiple observations are stored in the same table.

url <- "https://gist.githubusercontent.com/Kimmirikwa/b69d0ea134820ea52f8481991ffae93e/raw/4db7b1698035ee29885d10e1a59bd902716ae168/student_results.csv"
untidydata1 <- read_csv("https://gist.githubusercontent.com/Kimmirikwa/b69d0ea134820ea52f8481991ffae93e/raw/4db7b1698035ee29885d10e1a59bd902716ae168/student_results.csv")

## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, sex and age, test number
## dbl (5): id, phone, term 1, term 2, term 3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print(untidydata1)

## # A tibble: 10 × 8
##       id name   phone `sex and age` `test number` `term 1` `term 2` `term 3`
##    <dbl> <chr>  <dbl> <chr>         <chr>            <dbl>    <dbl>    <dbl>
##  1     1 Mike     134 m_12          test 1              76       84       87
##  2     2 Linda    270 f_13          test 1              88       90       73
##  3     3 Sam      210 m_11          test 1              78       74       80
##  4     4 Esther   617 f_12          test 1              68       75       74
##  5     5 Mary     114 f_14          test 1              65       67       64
##  6     1 Mike     134 m_12          test 2              85       80       90
##  7     2 Linda    270 f_13          test 2              87       82       94
##  8     3 Sam      210 m_11          test 2              80       87       80
##  9     4 Esther   617 f_12          test 2              70       75       78
## 10     5 Mary     114 f_14          test 2              68       70       63

First, I will split the table into studentdata and studentperformance. I will first work on studentdata and then student performance below.

studentdata <- untidydata1 %>%
  select(id, name, phone, `sex and age`) 
print(studentdata)

## # A tibble: 10 × 4
##       id name   phone `sex and age`
##    <dbl> <chr>  <dbl> <chr>        
##  1     1 Mike     134 m_12         
##  2     2 Linda    270 f_13         
##  3     3 Sam      210 m_11         
##  4     4 Esther   617 f_12         
##  5     5 Mary     114 f_14         
##  6     1 Mike     134 m_12         
##  7     2 Linda    270 f_13         
##  8     3 Sam      210 m_11         
##  9     4 Esther   617 f_12         
## 10     5 Mary     114 f_14

studentperformance <- untidydata1 %>%
  select(id, `test number`, `term 1`, `term 2`, `term 3`)
print(studentperformance)

## # A tibble: 10 × 5
##       id `test number` `term 1` `term 2` `term 3`
##    <dbl> <chr>            <dbl>    <dbl>    <dbl>
##  1     1 test 1              76       84       87
##  2     2 test 1              88       90       73
##  3     3 test 1              78       74       80
##  4     4 test 1              68       75       74
##  5     5 test 1              65       67       64
##  6     1 test 2              85       80       90
##  7     2 test 2              87       82       94
##  8     3 test 2              80       87       80
##  9     4 test 2              70       75       78
## 10     5 test 2              68       70       63

#studentdata has sex and age together. I need to split this up into separate columns 

studentdata <- studentdata %>%
  separate(`sex and age`, into = c("sex", "age"), sep = "_", convert = TRUE) %>%
  mutate(sex = recode(sex, m = "male", f = "female")) 

print(studentdata)

## # A tibble: 10 × 5
##       id name   phone sex      age
##    <dbl> <chr>  <dbl> <chr>  <int>
##  1     1 Mike     134 male      12
##  2     2 Linda    270 female    13
##  3     3 Sam      210 male      11
##  4     4 Esther   617 female    12
##  5     5 Mary     114 female    14
##  6     1 Mike     134 male      12
##  7     2 Linda    270 female    13
##  8     3 Sam      210 male      11
##  9     4 Esther   617 female    12
## 10     5 Mary     114 female    14

tidystudentdata <- studentdata %>%
  distinct() #removes repeating rows
print(tidystudentdata)

## # A tibble: 5 × 5
##      id name   phone sex      age
##   <dbl> <chr>  <dbl> <chr>  <int>
## 1     1 Mike     134 male      12
## 2     2 Linda    270 female    13
## 3     3 Sam      210 male      11
## 4     4 Esther   617 female    12
## 5     5 Mary     114 female    14

studentperformance_long <- studentperformance %>%
  pivot_longer(
    cols = starts_with("term"),  # Select columns that start with "term"
    names_to = "term",            # New column name for terms
    values_to = "grade"           # New column name for grades
  )
print(studentperformance_long)

## # A tibble: 30 × 4
##       id `test number` term   grade
##    <dbl> <chr>         <chr>  <dbl>
##  1     1 test 1        term 1    76
##  2     1 test 1        term 2    84
##  3     1 test 1        term 3    87
##  4     2 test 1        term 1    88
##  5     2 test 1        term 2    90
##  6     2 test 1        term 3    73
##  7     3 test 1        term 1    78
##  8     3 test 1        term 2    74
##  9     3 test 1        term 3    80
## 10     4 test 1        term 1    68
## # ℹ 20 more rows

Analysis Question: Do certain tests consistently show higher or lower performance across terms?

I used several visualizations to analyze the data as seen below. The line graph suggests that there are significant differences in grades for each test across the three terms; however, these differences are not as pronounced as they initially appear.

In contrast, the facet bar chart reveals that the scores are quite similar, with only slight variations between them. Given this minimal difference, we cannot conclusively determine that any test consistently demonstrates higher or lower performance.”

summary_performance <- studentperformance_long %>%
    group_by(`test number`) %>%
    summarise(Average_Grade = mean(grade, na.rm = TRUE), .groups = 'drop')

print(summary_performance)

## # A tibble: 2 × 2
##   `test number` Average_Grade
##   <chr>                 <dbl>
## 1 test 1                 76.2
## 2 test 2                 79.3

ggplot(summary_performance, aes(x = reorder(`test number`, -Average_Grade), y = Average_Grade, fill = `test number`)) +
    geom_bar(stat = "identity") +
    labs(title = "Average Grades by Test",
         x = "Test Number",
         y = "Average Grade") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Summarize data by term and test number
summary_performance_terms <- studentperformance_long %>%
    group_by(term, `test number`) %>%
    summarise(Average_Grade = mean(grade, na.rm = TRUE), .groups = 'drop')

# Line chart to see how the performance of each test varies across terms
ggplot(summary_performance_terms, aes(x = term, y = Average_Grade, group = `test number`, color = `test number`)) +
    geom_line() +
    geom_point() +
    labs(title = "Average Grades by Test Across Terms",
         x = "Term",
         y = "Average Grade") +
    theme_minimal()

# Facet plot to compare the average grades of each test across terms:
ggplot(summary_performance_terms, aes(x = term, y = Average_Grade, fill = `test number`)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    facet_wrap(~ `test number`) +
    labs(title = "Average Grades by Test Across Terms",
         x = "Term",
         y = "Average Grade") +
    theme_minimal()

DATA 607 – Project 2

Leslie Tavarez

2024-10-13