library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
The data is taken from Pro Football Reference and it is in wide format. Even though the dataset has win-loss column but it doesnt clarifies what it means. So I’m cleaning the column with correct values and changing the column names for clarification.
uncleaned_nfl <- read.csv("https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_nfl.csv")
uncleaned_nfl
## Tm W L T W.L. PF PA PD MoV SoS SRS OSRS DSRS
## 1 Buffalo Bills 3 1 0 0.750 114 58 56 14.0 2.8 16.8 4.9 11.9
## 2 Miami Dolphins 3 1 0 0.750 98 91 7 1.8 8.5 10.2 6.8 3.4
## 3 New York Jets 2 2 0 0.500 76 101 -25 -6.3 3.1 -3.1 -3.6 0.5
## 4 New England Patriots 1 3 0 0.250 74 98 -24 -6.0 5.3 -0.7 -2.7 2.0
## 5 Baltimore Ravens 2 2 0 0.500 119 100 19 4.8 5.8 10.5 12.4 -1.9
## 6 Cincinnati Bengals 2 2 0 0.500 91 70 21 5.3 2.0 7.2 2.9 4.4
## 7 Cleveland Browns 2 2 0 0.500 105 95 10 2.5 -4.2 -1.7 3.6 -5.3
## 8 Pittsburgh Steelers 1 3 0 0.250 74 90 -16 -4.0 0.4 -3.6 -2.9 -0.7
## 9 Jacksonville Jaguars 2 2 0 0.500 105 67 38 9.5 1.6 11.1 9.1 2.0
## 10 Tennessee Titans 2 2 0 0.500 75 101 -26 -6.5 0.0 -6.5 -0.6 -5.9
## 11 Indianapolis Colts 2 2 1 0.500 69 94 -25 -5.0 -1.6 -6.6 -11.9 5.3
## 12 Houston Texans 0 3 1 0.125 73 93 -20 -5.0 -6.2 -11.2 -1.6 -9.6
## 13 Kansas City Chiefs 3 1 0 0.750 129 96 33 8.3 -2.8 5.5 13.8 -8.3
## 14 Los Angeles Chargers 2 2 0 0.500 92 108 -16 -4.0 -0.5 -4.5 -4.8 0.3
## 15 Denver Broncos 2 3 0 0.400 75 80 -5 -1.0 -6.6 -7.6 -9.6 1.9
## 16 Las Vegas Raiders 1 3 0 0.250 96 100 -4 -1.0 -6.4 -7.4 0.9 -8.2
In this section I am renaming all the variable name so that it is easier to understand the columns
colnames(uncleaned_nfl) <- c('teams', 'win', 'loss', 'tie', 'win-loss', 'teamScore', 'opponentScore', 'scoreDiff', 'marginVictory', 'strengthScheduel', 'simpleRatingSys', 'offensiveSRS', 'defensiveSRS')
I have added few more columns like total_game and win_percent with
correct representation.
Select(-c(win-loss)) dropping the column
win-loss from the original dataset select(teams, win, loss, tie,
total_game, win_percent, everything()): over here I’m
rearranging the column order
clean_nfl <- uncleaned_nfl %>%
mutate(total_game = win + loss) %>%
mutate(win_percent = (win/total_game)*100) %>%
select(-c(`win-loss`)) %>%
select(teams, win, loss, tie, total_game, win_percent, everything())
clean_nfl
## teams win loss tie total_game win_percent teamScore
## 1 Buffalo Bills 3 1 0 4 75 114
## 2 Miami Dolphins 3 1 0 4 75 98
## 3 New York Jets 2 2 0 4 50 76
## 4 New England Patriots 1 3 0 4 25 74
## 5 Baltimore Ravens 2 2 0 4 50 119
## 6 Cincinnati Bengals 2 2 0 4 50 91
## 7 Cleveland Browns 2 2 0 4 50 105
## 8 Pittsburgh Steelers 1 3 0 4 25 74
## 9 Jacksonville Jaguars 2 2 0 4 50 105
## 10 Tennessee Titans 2 2 0 4 50 75
## 11 Indianapolis Colts 2 2 1 4 50 69
## 12 Houston Texans 0 3 1 3 0 73
## 13 Kansas City Chiefs 3 1 0 4 75 129
## 14 Los Angeles Chargers 2 2 0 4 50 92
## 15 Denver Broncos 2 3 0 5 40 75
## 16 Las Vegas Raiders 1 3 0 4 25 96
## opponentScore scoreDiff marginVictory strengthScheduel simpleRatingSys
## 1 58 56 14.0 2.8 16.8
## 2 91 7 1.8 8.5 10.2
## 3 101 -25 -6.3 3.1 -3.1
## 4 98 -24 -6.0 5.3 -0.7
## 5 100 19 4.8 5.8 10.5
## 6 70 21 5.3 2.0 7.2
## 7 95 10 2.5 -4.2 -1.7
## 8 90 -16 -4.0 0.4 -3.6
## 9 67 38 9.5 1.6 11.1
## 10 101 -26 -6.5 0.0 -6.5
## 11 94 -25 -5.0 -1.6 -6.6
## 12 93 -20 -5.0 -6.2 -11.2
## 13 96 33 8.3 -2.8 5.5
## 14 108 -16 -4.0 -0.5 -4.5
## 15 80 -5 -1.0 -6.6 -7.6
## 16 100 -4 -1.0 -6.4 -7.4
## offensiveSRS defensiveSRS
## 1 4.9 11.9
## 2 6.8 3.4
## 3 -3.6 0.5
## 4 -2.7 2.0
## 5 12.4 -1.9
## 6 2.9 4.4
## 7 3.6 -5.3
## 8 -2.9 -0.7
## 9 9.1 2.0
## 10 -0.6 -5.9
## 11 -11.9 5.3
## 12 -1.6 -9.6
## 13 13.8 -8.3
## 14 -4.8 0.3
## 15 -9.6 1.9
## 16 0.9 -8.2
I used geom_col to graph the data and the visualization represents the maximum and minumum win% of a team.
ggplot(clean_nfl, aes(x= teams, y = win_percent)) +
geom_col(position = 'dodge') +
#scale_y_continuous(labels = percent) +
coord_flip()
### Student Records
A dataset of students with their score in each term. The dataframe is in wider format and there are empty rows which needs to be deleted.
uncleaned_student_records <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_student_records.csv')
in this section, column sex_age is splitted into two column, which are sex and age, with their respective values. Also, gathering the term.1, term.2 and term.3 column into single column called term and its values are stored in column score
student_exam_df <- uncleaned_student_records |>
drop_na() |>
separate(sex.and.age, into = c('sex', 'age'), sep = '_') |>
pivot_longer(cols = c('term.1', 'term.2', 'term.3'), names_to = 'term', values_to = 'score')
In this section, I am extracting all on the numbers of the columns and avoiding the characters.
student_exam_df$term <- str_extract(student_exam_df$term, '\\d')
student_exam_df$test.number <- str_extract(student_exam_df$test.number, '\\d')
student_exam_df
## # A tibble: 60 × 8
## id name phone sex age test.number term score
## <int> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 1 Mike 134 m 12 1 1 76
## 2 1 Mike 134 m 12 1 2 84
## 3 1 Mike 134 m 12 1 3 87
## 4 2 Linda 270 f 13 1 1 88
## 5 2 Linda 270 f 13 1 2 90
## 6 2 Linda 270 f 13 1 3 73
## 7 3 Sam 210 m 11 1 1 78
## 8 3 Sam 210 m 11 1 2 74
## 9 3 Sam 210 m 11 1 3 80
## 10 4 Esther 617 f 12 1 1 68
## # … with 50 more rows
## # ℹ Use `print(n = ...)` to see more rows
in this section, I have averaged the score of student’s name and term. I have used .groups = ‘drops’ because in my version of R, 4.2.1, it looks like multiple grouping is not supported.
avg_df <- student_exam_df |>
group_by(name, term) |>
summarise(avg = mean(score), .groups = 'drop')
head(avg_df)
## # A tibble: 6 × 3
## name term avg
## <chr> <chr> <dbl>
## 1 Dan 1 84.5
## 2 Dan 2 74
## 3 Dan 3 89.5
## 4 Esther 1 69
## 5 Esther 2 75
## 6 Esther 3 76
uncleaned_admission_df <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/project2_cleaning_data/unclean_admission.csv')
uncleaned_admission_df
## Gender Dept Admitted Rejected
## 1 Male A 512 313
## 2 Female A 89 19
## 3 Male B 353 207
## 4 Female B 17 8
## 5 Male C 120 205
## 6 Female C 202 391
## 7 Male D 138 279
## 8 Female D 131 244
## 9 Male E 53 138
## 10 Female E 94 299
## 11 Male F 22 351
## 12 Female F 24 317
admin_gender_df <- uncleaned_admission_df |>
pivot_longer(cols = c('Admitted', 'Rejected'), names_to = 'status', values_to = 'total')
admin_gender_df
## # A tibble: 24 × 4
## Gender Dept status total
## <chr> <chr> <chr> <int>
## 1 Male A Admitted 512
## 2 Male A Rejected 313
## 3 Female A Admitted 89
## 4 Female A Rejected 19
## 5 Male B Admitted 353
## 6 Male B Rejected 207
## 7 Female B Admitted 17
## 8 Female B Rejected 8
## 9 Male C Admitted 120
## 10 Male C Rejected 205
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows
admin_gender_df |>
group_by(Gender) |>
summarise(total_application = sum(total))
## # A tibble: 2 × 2
## Gender total_application
## <chr> <int>
## 1 Female 1835
## 2 Male 2691
ggplot(admin_gender_df, aes(x=status, y = total, fill=Gender)) +
geom_col(position = 'dodge')+
coord_flip()