# pasted from a pdf file - cleaning the data
grade_url <- "https://raw.githubusercontent.com/emrahakin1985/DATA607/master/datasets/Salary_Increase.csv"

grade <- read.csv(grade_url, stringsAsFactors = F)
grade
##                   Grade.Step.1.Step.2.Step.3.Step.4.Step.5.Step.6.Step.7.Step.8.Step.9.Step.10
## 1  1 $ 24,019 $ 24,823 $ 25,621 $ 26,416 $ 27,215 $ 27,682 $ 28,471 $ 29,268 $ 29,300 $ 30,041
## 2                      2 27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985
## 3                      3 29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310
## 4                      4 33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997
## 5                      5 37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117
## 6                      6 41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634
## 7                      7 45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600
## 8                      8 50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997
## 9                      9 56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901
## 10                    10 61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282
## 11                    11 67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194
## 12                 12 81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718
## 13            13 96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706
## 14          14 114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550
## 15   15 134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 * 161,900 * 161,900 *
a <- str_replace_all(unlist(str_extract_all(unlist(grade), ".+")), "\\$", "")
a
##  [1] "1  24,019  24,823  25,621  26,416  27,215  27,682  28,471  29,268  29,300  30,041"        
##  [2] "2 27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985"                  
##  [3] "3 29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310"                  
##  [4] "4 33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997"                  
##  [5] "5 37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117"                  
##  [6] "6 41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634"                  
##  [7] "7 45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600"                  
##  [8] "8 50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997"                  
##  [9] "9 56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901"                  
## [10] "10 61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282"                 
## [11] "11 67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194"                 
## [12] "12 81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718"              
## [13] "13 96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706"         
## [14] "14 114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550"       
## [15] "15 134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 * 161,900 * 161,900 * "
v <- unlist(str_extract_all(a, "([0-9]){2,3},([0-9]){3}"))
v
##   [1] "24,019"  "24,823"  "25,621"  "26,416"  "27,215"  "27,682"  "28,471" 
##   [8] "29,268"  "29,300"  "30,041"  "27,005"  "27,648"  "28,542"  "29,300" 
##  [15] "29,629"  "30,500"  "31,371"  "32,243"  "33,114"  "33,985"  "29,466" 
##  [22] "30,448"  "31,431"  "32,414"  "33,397"  "34,379"  "35,362"  "36,345" 
##  [29] "37,328"  "38,310"  "33,079"  "34,181"  "35,283"  "36,385"  "37,487" 
##  [36] "38,589"  "39,691"  "40,793"  "41,895"  "42,997"  "37,009"  "38,243" 
##  [43] "39,477"  "40,711"  "41,946"  "43,180"  "44,414"  "45,648"  "46,883" 
##  [50] "48,117"  "41,253"  "42,629"  "44,005"  "45,380"  "46,756"  "48,131" 
##  [57] "49,507"  "50,882"  "52,258"  "53,634"  "45,843"  "47,372"  "48,900" 
##  [64] "50,429"  "51,957"  "53,486"  "55,014"  "56,543"  "58,072"  "59,600" 
##  [71] "50,770"  "52,462"  "54,154"  "55,845"  "57,537"  "59,229"  "60,921" 
##  [78] "62,613"  "64,305"  "65,997"  "56,075"  "57,944"  "59,814"  "61,684" 
##  [85] "63,553"  "65,423"  "67,292"  "69,162"  "71,031"  "72,901"  "61,752" 
##  [92] "63,811"  "65,870"  "67,929"  "69,988"  "72,047"  "74,105"  "76,164" 
##  [99] "78,223"  "80,282"  "67,845"  "70,106"  "72,367"  "74,628"  "76,889" 
## [106] "79,150"  "81,411"  "83,672"  "85,933"  "88,194"  "81,319"  "84,030" 
## [113] "86,741"  "89,452"  "92,163"  "94,874"  "97,585"  "100,296" "103,007"
## [120] "105,718" "96,698"  "99,921"  "103,144" "106,367" "109,591" "112,814"
## [127] "116,037" "119,260" "122,483" "125,706" "114,268" "118,077" "121,887"
## [134] "125,696" "129,505" "133,314" "137,123" "140,932" "144,741" "148,550"
## [141] "134,411" "138,891" "143,372" "147,853" "152,334" "156,814" "161,295"
## [148] "161,900" "161,900" "161,900"


Creating the data frame:

df <- matrix(v, nrow = 10, ncol = 15)
df <- data.frame(t(df))
knitr::kable(df)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
24,019 24,823 25,621 26,416 27,215 27,682 28,471 29,268 29,300 30,041
27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985
29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310
33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997
37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117
41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634
45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600
50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997
56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901
61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282
67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194
81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718
96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706
114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550
134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 161,900 161,900


Adding column names, grade column and reordering the columns.

names(df) <- str_c("Step ", 1:10)
df <- df %>% 
  mutate(Grade = 1:15) %>% 
  select(11, 1:10)
knitr::kable(df)
Grade Step 1 Step 2 Step 3 Step 4 Step 5 Step 6 Step 7 Step 8 Step 9 Step 10
1 24,019 24,823 25,621 26,416 27,215 27,682 28,471 29,268 29,300 30,041
2 27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985
3 29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310
4 33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997
5 37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117
6 41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634
7 45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600
8 50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997
9 56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901
10 61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282
11 67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194
12 81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718
13 96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706
14 114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550
15 134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 161,900 161,900


Gathering Step variables:

df2 <- suppressWarnings(df %>% 
  gather("Step", "value", -Grade))
knitr::kable(head(df2, 30))
Grade Step value
1 Step 1 24,019
2 Step 1 27,005
3 Step 1 29,466
4 Step 1 33,079
5 Step 1 37,009
6 Step 1 41,253
7 Step 1 45,843
8 Step 1 50,770
9 Step 1 56,075
10 Step 1 61,752
11 Step 1 67,845
12 Step 1 81,319
13 Step 1 96,698
14 Step 1 114,268
15 Step 1 134,411
1 Step 2 24,823
2 Step 2 27,648
3 Step 2 30,448
4 Step 2 34,181
5 Step 2 38,243
6 Step 2 42,629
7 Step 2 47,372
8 Step 2 52,462
9 Step 2 57,944
10 Step 2 63,811
11 Step 2 70,106
12 Step 2 84,030
13 Step 2 99,921
14 Step 2 118,077
15 Step 2 138,891


Cleaning and reformatting Step and value (Salary) columns:

df3 <- df2 %>% 
  mutate(
    Step = str_extract_all(Step, "[0-9]{1,2}"),
    Step = as.numeric(Step),
    value = str_replace(value, ",", ""),
    value = as.numeric(value)
  ) %>% 
  rename(Salary = value)
  

knitr::kable(head(df3,15))
Grade Step Salary
1 1 24019
2 1 27005
3 1 29466
4 1 33079
5 1 37009
6 1 41253
7 1 45843
8 1 50770
9 1 56075
10 1 61752
11 1 67845
12 1 81319
13 1 96698
14 1 114268
15 1 134411
knitr::kable(tail(df3,15))
Grade Step Salary
136 1 10 30041
137 2 10 33985
138 3 10 38310
139 4 10 42997
140 5 10 48117
141 6 10 53634
142 7 10 59600
143 8 10 65997
144 9 10 72901
145 10 10 80282
146 11 10 88194
147 12 10 105718
148 13 10 125706
149 14 10 148550
150 15 10 161900


Calculating mean and st. dev for salary grouped by grades:

df3 %>% 
  group_by(Grade) %>% 
  summarise("Mean Salary" = mean(Salary), St.dev = sd(Salary))
## # A tibble: 15 x 3
##    Grade `Mean Salary`    St.dev
##    <int>         <dbl>     <dbl>
##  1     1       27285.6  2038.774
##  2     2       30333.7  2330.467
##  3     3       33888.0  2975.391
##  4     4       38038.0  3336.471
##  5     5       42562.8  3736.855
##  6     6       47443.5  4164.817
##  7     7       52721.6  4627.938
##  8     8       58383.3  5122.399
##  9     9       64487.9  5660.367
## 10    10       71017.1  6233.492
## 11    11       78019.5  6845.517
## 12    12       93518.5  8207.960
## 13    13      111202.1  9758.557
## 14    14      131409.3 11532.614
## 15    15      152067.0 10416.478


Percentage change in mean salary for each grade (as suggested in the post):

tbl <- df3 %>% 
  group_by(Grade) %>% 
  summarise(Mean_Salary = mean(Salary))

tbl %>% 
      mutate(pChange = (Mean_Salary - lag(Mean_Salary))/lag(Mean_Salary) * 100)
## # A tibble: 15 x 3
##    Grade Mean_Salary  pChange
##    <int>       <dbl>    <dbl>
##  1     1     27285.6       NA
##  2     2     30333.7 11.17109
##  3     3     33888.0 11.71733
##  4     4     38038.0 12.24622
##  5     5     42562.8 11.89547
##  6     6     47443.5 11.46706
##  7     7     52721.6 11.12502
##  8     8     58383.3 10.73886
##  9     9     64487.9 10.45607
## 10    10     71017.1 10.12469
## 11    11     78019.5  9.86016
## 12    12     93518.5 19.86555
## 13    13    111202.1 18.90920
## 14    14    131409.3 18.17160
## 15    15    152067.0 15.72012