library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)

The data is taken from Pro Football Reference and it is in wide format. Even though the dataset has win-loss column but it doesnt clarifies what it means. So I’m cleaning the column with correct values and changing the column names for clarification.

uncleaned_nfl <- read.csv("https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_nfl.csv")
uncleaned_nfl
##                      Tm W L T  W.L.  PF  PA  PD  MoV  SoS   SRS  OSRS DSRS
## 1         Buffalo Bills 3 1 0 0.750 114  58  56 14.0  2.8  16.8   4.9 11.9
## 2        Miami Dolphins 3 1 0 0.750  98  91   7  1.8  8.5  10.2   6.8  3.4
## 3         New York Jets 2 2 0 0.500  76 101 -25 -6.3  3.1  -3.1  -3.6  0.5
## 4  New England Patriots 1 3 0 0.250  74  98 -24 -6.0  5.3  -0.7  -2.7  2.0
## 5      Baltimore Ravens 2 2 0 0.500 119 100  19  4.8  5.8  10.5  12.4 -1.9
## 6    Cincinnati Bengals 2 2 0 0.500  91  70  21  5.3  2.0   7.2   2.9  4.4
## 7      Cleveland Browns 2 2 0 0.500 105  95  10  2.5 -4.2  -1.7   3.6 -5.3
## 8   Pittsburgh Steelers 1 3 0 0.250  74  90 -16 -4.0  0.4  -3.6  -2.9 -0.7
## 9  Jacksonville Jaguars 2 2 0 0.500 105  67  38  9.5  1.6  11.1   9.1  2.0
## 10     Tennessee Titans 2 2 0 0.500  75 101 -26 -6.5  0.0  -6.5  -0.6 -5.9
## 11   Indianapolis Colts 2 2 1 0.500  69  94 -25 -5.0 -1.6  -6.6 -11.9  5.3
## 12       Houston Texans 0 3 1 0.125  73  93 -20 -5.0 -6.2 -11.2  -1.6 -9.6
## 13   Kansas City Chiefs 3 1 0 0.750 129  96  33  8.3 -2.8   5.5  13.8 -8.3
## 14 Los Angeles Chargers 2 2 0 0.500  92 108 -16 -4.0 -0.5  -4.5  -4.8  0.3
## 15       Denver Broncos 2 3 0 0.400  75  80  -5 -1.0 -6.6  -7.6  -9.6  1.9
## 16    Las Vegas Raiders 1 3 0 0.250  96 100  -4 -1.0 -6.4  -7.4   0.9 -8.2

In this section I am renaming all the variable name so that it is easier to understand the columns

colnames(uncleaned_nfl) <- c('teams', 'win', 'loss', 'tie', 'win-loss', 'teamScore', 'opponentScore', 'scoreDiff', 'marginVictory', 'strengthScheduel', 'simpleRatingSys', 'offensiveSRS', 'defensiveSRS')

I have added few more columns like total_game and win_percent with correct representation. Select(-c(win-loss)) dropping the column win-loss from the original dataset select(teams, win, loss, tie, total_game, win_percent, everything()): over here I’m rearranging the column order

clean_nfl <- uncleaned_nfl %>%
  mutate(total_game = win + loss) %>%
  mutate(win_percent = (win/total_game)*100) %>%
  select(-c(`win-loss`)) %>%
  select(teams, win, loss, tie, total_game, win_percent, everything())

clean_nfl
##                   teams win loss tie total_game win_percent teamScore
## 1         Buffalo Bills   3    1   0          4          75       114
## 2        Miami Dolphins   3    1   0          4          75        98
## 3         New York Jets   2    2   0          4          50        76
## 4  New England Patriots   1    3   0          4          25        74
## 5      Baltimore Ravens   2    2   0          4          50       119
## 6    Cincinnati Bengals   2    2   0          4          50        91
## 7      Cleveland Browns   2    2   0          4          50       105
## 8   Pittsburgh Steelers   1    3   0          4          25        74
## 9  Jacksonville Jaguars   2    2   0          4          50       105
## 10     Tennessee Titans   2    2   0          4          50        75
## 11   Indianapolis Colts   2    2   1          4          50        69
## 12       Houston Texans   0    3   1          3           0        73
## 13   Kansas City Chiefs   3    1   0          4          75       129
## 14 Los Angeles Chargers   2    2   0          4          50        92
## 15       Denver Broncos   2    3   0          5          40        75
## 16    Las Vegas Raiders   1    3   0          4          25        96
##    opponentScore scoreDiff marginVictory strengthScheduel simpleRatingSys
## 1             58        56          14.0              2.8            16.8
## 2             91         7           1.8              8.5            10.2
## 3            101       -25          -6.3              3.1            -3.1
## 4             98       -24          -6.0              5.3            -0.7
## 5            100        19           4.8              5.8            10.5
## 6             70        21           5.3              2.0             7.2
## 7             95        10           2.5             -4.2            -1.7
## 8             90       -16          -4.0              0.4            -3.6
## 9             67        38           9.5              1.6            11.1
## 10           101       -26          -6.5              0.0            -6.5
## 11            94       -25          -5.0             -1.6            -6.6
## 12            93       -20          -5.0             -6.2           -11.2
## 13            96        33           8.3             -2.8             5.5
## 14           108       -16          -4.0             -0.5            -4.5
## 15            80        -5          -1.0             -6.6            -7.6
## 16           100        -4          -1.0             -6.4            -7.4
##    offensiveSRS defensiveSRS
## 1           4.9         11.9
## 2           6.8          3.4
## 3          -3.6          0.5
## 4          -2.7          2.0
## 5          12.4         -1.9
## 6           2.9          4.4
## 7           3.6         -5.3
## 8          -2.9         -0.7
## 9           9.1          2.0
## 10         -0.6         -5.9
## 11        -11.9          5.3
## 12         -1.6         -9.6
## 13         13.8         -8.3
## 14         -4.8          0.3
## 15         -9.6          1.9
## 16          0.9         -8.2

Analysis

I used geom_col to graph the data and the visualization represents the maximum and minumum win% of a team.

ggplot(clean_nfl, aes(x= teams, y = win_percent)) + 
  geom_col(position = 'dodge') + 
  #scale_y_continuous(labels = percent) + 
  coord_flip()

### Student Records

A dataset of students with their score in each term. The dataframe is in wider format and there are empty rows which needs to be deleted.

uncleaned_student_records <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_student_records.csv')

in this section, column sex_age is splitted into two column, which are sex and age, with their respective values. Also, gathering the term.1, term.2 and term.3 column into single column called term and its values are stored in column score

student_exam_df <- uncleaned_student_records |>
  drop_na() |> 
  separate(sex.and.age, into = c('sex', 'age'), sep = '_') |> 
  pivot_longer(cols = c('term.1', 'term.2', 'term.3'), names_to = 'term', values_to = 'score')

In this section, I am extracting all on the numbers of the columns and avoiding the characters.

student_exam_df$term <- str_extract(student_exam_df$term, '\\d')
student_exam_df$test.number <- str_extract(student_exam_df$test.number, '\\d')

student_exam_df
## # A tibble: 60 × 8
##       id name   phone sex   age   test.number term  score
##    <int> <chr>  <int> <chr> <chr> <chr>       <chr> <int>
##  1     1 Mike     134 m     12    1           1        76
##  2     1 Mike     134 m     12    1           2        84
##  3     1 Mike     134 m     12    1           3        87
##  4     2 Linda    270 f     13    1           1        88
##  5     2 Linda    270 f     13    1           2        90
##  6     2 Linda    270 f     13    1           3        73
##  7     3 Sam      210 m     11    1           1        78
##  8     3 Sam      210 m     11    1           2        74
##  9     3 Sam      210 m     11    1           3        80
## 10     4 Esther   617 f     12    1           1        68
## # … with 50 more rows
## # ℹ Use `print(n = ...)` to see more rows

in this section, I have averaged the score of student’s name and term. I have used .groups = ‘drops’ because in my version of R, 4.2.1, it looks like multiple grouping is not supported.

avg_df <- student_exam_df |> 
  group_by(name, term) |>
  summarise(avg = mean(score), .groups = 'drop')

head(avg_df)
## # A tibble: 6 × 3
##   name   term    avg
##   <chr>  <chr> <dbl>
## 1 Dan    1      84.5
## 2 Dan    2      74  
## 3 Dan    3      89.5
## 4 Esther 1      69  
## 5 Esther 2      75  
## 6 Esther 3      76

Admission Records based on gender

uncleaned_admission_df <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_admission.csv')

uncleaned_admission_df
##    Gender Dept Admitted Rejected
## 1    Male    A      512      313
## 2  Female    A       89       19
## 3    Male    B      353      207
## 4  Female    B       17        8
## 5    Male    C      120      205
## 6  Female    C      202      391
## 7    Male    D      138      279
## 8  Female    D      131      244
## 9    Male    E       53      138
## 10 Female    E       94      299
## 11   Male    F       22      351
## 12 Female    F       24      317
admin_gender_df <- uncleaned_admission_df |> 
  pivot_longer(cols = c('Admitted', 'Rejected'), names_to = 'status', values_to = 'total')

admin_gender_df
## # A tibble: 24 × 4
##    Gender Dept  status   total
##    <chr>  <chr> <chr>    <int>
##  1 Male   A     Admitted   512
##  2 Male   A     Rejected   313
##  3 Female A     Admitted    89
##  4 Female A     Rejected    19
##  5 Male   B     Admitted   353
##  6 Male   B     Rejected   207
##  7 Female B     Admitted    17
##  8 Female B     Rejected     8
##  9 Male   C     Admitted   120
## 10 Male   C     Rejected   205
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows
admin_gender_df |> 
  group_by(Gender) |>
  summarise(total_application = sum(total))
## # A tibble: 2 × 2
##   Gender total_application
##   <chr>              <int>
## 1 Female              1835
## 2 Male                2691
ggplot(admin_gender_df, aes(x=status, y = total, fill=Gender)) + 
  geom_col(position = 'dodge')+ 
  coord_flip()