# pasted from a pdf file - cleaning the data
grade_url <- "https://raw.githubusercontent.com/emrahakin1985/DATA607/master/datasets/Salary_Increase.csv"
grade <- read.csv(grade_url, stringsAsFactors = F)
grade
## Grade.Step.1.Step.2.Step.3.Step.4.Step.5.Step.6.Step.7.Step.8.Step.9.Step.10
## 1 1 $ 24,019 $ 24,823 $ 25,621 $ 26,416 $ 27,215 $ 27,682 $ 28,471 $ 29,268 $ 29,300 $ 30,041
## 2 2 27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985
## 3 3 29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310
## 4 4 33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997
## 5 5 37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117
## 6 6 41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634
## 7 7 45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600
## 8 8 50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997
## 9 9 56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901
## 10 10 61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282
## 11 11 67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194
## 12 12 81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718
## 13 13 96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706
## 14 14 114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550
## 15 15 134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 * 161,900 * 161,900 *
a <- str_replace_all(unlist(str_extract_all(unlist(grade), ".+")), "\\$", "")
a
## [1] "1 24,019 24,823 25,621 26,416 27,215 27,682 28,471 29,268 29,300 30,041"
## [2] "2 27,005 27,648 28,542 29,300 29,629 30,500 31,371 32,243 33,114 33,985"
## [3] "3 29,466 30,448 31,431 32,414 33,397 34,379 35,362 36,345 37,328 38,310"
## [4] "4 33,079 34,181 35,283 36,385 37,487 38,589 39,691 40,793 41,895 42,997"
## [5] "5 37,009 38,243 39,477 40,711 41,946 43,180 44,414 45,648 46,883 48,117"
## [6] "6 41,253 42,629 44,005 45,380 46,756 48,131 49,507 50,882 52,258 53,634"
## [7] "7 45,843 47,372 48,900 50,429 51,957 53,486 55,014 56,543 58,072 59,600"
## [8] "8 50,770 52,462 54,154 55,845 57,537 59,229 60,921 62,613 64,305 65,997"
## [9] "9 56,075 57,944 59,814 61,684 63,553 65,423 67,292 69,162 71,031 72,901"
## [10] "10 61,752 63,811 65,870 67,929 69,988 72,047 74,105 76,164 78,223 80,282"
## [11] "11 67,845 70,106 72,367 74,628 76,889 79,150 81,411 83,672 85,933 88,194"
## [12] "12 81,319 84,030 86,741 89,452 92,163 94,874 97,585 100,296 103,007 105,718"
## [13] "13 96,698 99,921 103,144 106,367 109,591 112,814 116,037 119,260 122,483 125,706"
## [14] "14 114,268 118,077 121,887 125,696 129,505 133,314 137,123 140,932 144,741 148,550"
## [15] "15 134,411 138,891 143,372 147,853 152,334 156,814 161,295 161,900 * 161,900 * 161,900 * "
v <- unlist(str_extract_all(a, "([0-9]){2,3},([0-9]){3}"))
v
## [1] "24,019" "24,823" "25,621" "26,416" "27,215" "27,682" "28,471"
## [8] "29,268" "29,300" "30,041" "27,005" "27,648" "28,542" "29,300"
## [15] "29,629" "30,500" "31,371" "32,243" "33,114" "33,985" "29,466"
## [22] "30,448" "31,431" "32,414" "33,397" "34,379" "35,362" "36,345"
## [29] "37,328" "38,310" "33,079" "34,181" "35,283" "36,385" "37,487"
## [36] "38,589" "39,691" "40,793" "41,895" "42,997" "37,009" "38,243"
## [43] "39,477" "40,711" "41,946" "43,180" "44,414" "45,648" "46,883"
## [50] "48,117" "41,253" "42,629" "44,005" "45,380" "46,756" "48,131"
## [57] "49,507" "50,882" "52,258" "53,634" "45,843" "47,372" "48,900"
## [64] "50,429" "51,957" "53,486" "55,014" "56,543" "58,072" "59,600"
## [71] "50,770" "52,462" "54,154" "55,845" "57,537" "59,229" "60,921"
## [78] "62,613" "64,305" "65,997" "56,075" "57,944" "59,814" "61,684"
## [85] "63,553" "65,423" "67,292" "69,162" "71,031" "72,901" "61,752"
## [92] "63,811" "65,870" "67,929" "69,988" "72,047" "74,105" "76,164"
## [99] "78,223" "80,282" "67,845" "70,106" "72,367" "74,628" "76,889"
## [106] "79,150" "81,411" "83,672" "85,933" "88,194" "81,319" "84,030"
## [113] "86,741" "89,452" "92,163" "94,874" "97,585" "100,296" "103,007"
## [120] "105,718" "96,698" "99,921" "103,144" "106,367" "109,591" "112,814"
## [127] "116,037" "119,260" "122,483" "125,706" "114,268" "118,077" "121,887"
## [134] "125,696" "129,505" "133,314" "137,123" "140,932" "144,741" "148,550"
## [141] "134,411" "138,891" "143,372" "147,853" "152,334" "156,814" "161,295"
## [148] "161,900" "161,900" "161,900"
Creating the data frame:
df <- matrix(v, nrow = 10, ncol = 15)
df <- data.frame(t(df))
knitr::kable(df)
X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 |
---|---|---|---|---|---|---|---|---|---|
24,019 | 24,823 | 25,621 | 26,416 | 27,215 | 27,682 | 28,471 | 29,268 | 29,300 | 30,041 |
27,005 | 27,648 | 28,542 | 29,300 | 29,629 | 30,500 | 31,371 | 32,243 | 33,114 | 33,985 |
29,466 | 30,448 | 31,431 | 32,414 | 33,397 | 34,379 | 35,362 | 36,345 | 37,328 | 38,310 |
33,079 | 34,181 | 35,283 | 36,385 | 37,487 | 38,589 | 39,691 | 40,793 | 41,895 | 42,997 |
37,009 | 38,243 | 39,477 | 40,711 | 41,946 | 43,180 | 44,414 | 45,648 | 46,883 | 48,117 |
41,253 | 42,629 | 44,005 | 45,380 | 46,756 | 48,131 | 49,507 | 50,882 | 52,258 | 53,634 |
45,843 | 47,372 | 48,900 | 50,429 | 51,957 | 53,486 | 55,014 | 56,543 | 58,072 | 59,600 |
50,770 | 52,462 | 54,154 | 55,845 | 57,537 | 59,229 | 60,921 | 62,613 | 64,305 | 65,997 |
56,075 | 57,944 | 59,814 | 61,684 | 63,553 | 65,423 | 67,292 | 69,162 | 71,031 | 72,901 |
61,752 | 63,811 | 65,870 | 67,929 | 69,988 | 72,047 | 74,105 | 76,164 | 78,223 | 80,282 |
67,845 | 70,106 | 72,367 | 74,628 | 76,889 | 79,150 | 81,411 | 83,672 | 85,933 | 88,194 |
81,319 | 84,030 | 86,741 | 89,452 | 92,163 | 94,874 | 97,585 | 100,296 | 103,007 | 105,718 |
96,698 | 99,921 | 103,144 | 106,367 | 109,591 | 112,814 | 116,037 | 119,260 | 122,483 | 125,706 |
114,268 | 118,077 | 121,887 | 125,696 | 129,505 | 133,314 | 137,123 | 140,932 | 144,741 | 148,550 |
134,411 | 138,891 | 143,372 | 147,853 | 152,334 | 156,814 | 161,295 | 161,900 | 161,900 | 161,900 |
Adding column names, grade column and reordering the columns.
names(df) <- str_c("Step ", 1:10)
df <- df %>%
mutate(Grade = 1:15) %>%
select(11, 1:10)
knitr::kable(df)
Grade | Step 1 | Step 2 | Step 3 | Step 4 | Step 5 | Step 6 | Step 7 | Step 8 | Step 9 | Step 10 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 24,019 | 24,823 | 25,621 | 26,416 | 27,215 | 27,682 | 28,471 | 29,268 | 29,300 | 30,041 |
2 | 27,005 | 27,648 | 28,542 | 29,300 | 29,629 | 30,500 | 31,371 | 32,243 | 33,114 | 33,985 |
3 | 29,466 | 30,448 | 31,431 | 32,414 | 33,397 | 34,379 | 35,362 | 36,345 | 37,328 | 38,310 |
4 | 33,079 | 34,181 | 35,283 | 36,385 | 37,487 | 38,589 | 39,691 | 40,793 | 41,895 | 42,997 |
5 | 37,009 | 38,243 | 39,477 | 40,711 | 41,946 | 43,180 | 44,414 | 45,648 | 46,883 | 48,117 |
6 | 41,253 | 42,629 | 44,005 | 45,380 | 46,756 | 48,131 | 49,507 | 50,882 | 52,258 | 53,634 |
7 | 45,843 | 47,372 | 48,900 | 50,429 | 51,957 | 53,486 | 55,014 | 56,543 | 58,072 | 59,600 |
8 | 50,770 | 52,462 | 54,154 | 55,845 | 57,537 | 59,229 | 60,921 | 62,613 | 64,305 | 65,997 |
9 | 56,075 | 57,944 | 59,814 | 61,684 | 63,553 | 65,423 | 67,292 | 69,162 | 71,031 | 72,901 |
10 | 61,752 | 63,811 | 65,870 | 67,929 | 69,988 | 72,047 | 74,105 | 76,164 | 78,223 | 80,282 |
11 | 67,845 | 70,106 | 72,367 | 74,628 | 76,889 | 79,150 | 81,411 | 83,672 | 85,933 | 88,194 |
12 | 81,319 | 84,030 | 86,741 | 89,452 | 92,163 | 94,874 | 97,585 | 100,296 | 103,007 | 105,718 |
13 | 96,698 | 99,921 | 103,144 | 106,367 | 109,591 | 112,814 | 116,037 | 119,260 | 122,483 | 125,706 |
14 | 114,268 | 118,077 | 121,887 | 125,696 | 129,505 | 133,314 | 137,123 | 140,932 | 144,741 | 148,550 |
15 | 134,411 | 138,891 | 143,372 | 147,853 | 152,334 | 156,814 | 161,295 | 161,900 | 161,900 | 161,900 |
Gathering Step variables:
df2 <- suppressWarnings(df %>%
gather("Step", "value", -Grade))
knitr::kable(head(df2, 30))
Grade | Step | value |
---|---|---|
1 | Step 1 | 24,019 |
2 | Step 1 | 27,005 |
3 | Step 1 | 29,466 |
4 | Step 1 | 33,079 |
5 | Step 1 | 37,009 |
6 | Step 1 | 41,253 |
7 | Step 1 | 45,843 |
8 | Step 1 | 50,770 |
9 | Step 1 | 56,075 |
10 | Step 1 | 61,752 |
11 | Step 1 | 67,845 |
12 | Step 1 | 81,319 |
13 | Step 1 | 96,698 |
14 | Step 1 | 114,268 |
15 | Step 1 | 134,411 |
1 | Step 2 | 24,823 |
2 | Step 2 | 27,648 |
3 | Step 2 | 30,448 |
4 | Step 2 | 34,181 |
5 | Step 2 | 38,243 |
6 | Step 2 | 42,629 |
7 | Step 2 | 47,372 |
8 | Step 2 | 52,462 |
9 | Step 2 | 57,944 |
10 | Step 2 | 63,811 |
11 | Step 2 | 70,106 |
12 | Step 2 | 84,030 |
13 | Step 2 | 99,921 |
14 | Step 2 | 118,077 |
15 | Step 2 | 138,891 |
Cleaning and reformatting Step and value (Salary) columns:
df3 <- df2 %>%
mutate(
Step = str_extract_all(Step, "[0-9]{1,2}"),
Step = as.numeric(Step),
value = str_replace(value, ",", ""),
value = as.numeric(value)
) %>%
rename(Salary = value)
knitr::kable(head(df3,15))
Grade | Step | Salary |
---|---|---|
1 | 1 | 24019 |
2 | 1 | 27005 |
3 | 1 | 29466 |
4 | 1 | 33079 |
5 | 1 | 37009 |
6 | 1 | 41253 |
7 | 1 | 45843 |
8 | 1 | 50770 |
9 | 1 | 56075 |
10 | 1 | 61752 |
11 | 1 | 67845 |
12 | 1 | 81319 |
13 | 1 | 96698 |
14 | 1 | 114268 |
15 | 1 | 134411 |
knitr::kable(tail(df3,15))
Grade | Step | Salary | |
---|---|---|---|
136 | 1 | 10 | 30041 |
137 | 2 | 10 | 33985 |
138 | 3 | 10 | 38310 |
139 | 4 | 10 | 42997 |
140 | 5 | 10 | 48117 |
141 | 6 | 10 | 53634 |
142 | 7 | 10 | 59600 |
143 | 8 | 10 | 65997 |
144 | 9 | 10 | 72901 |
145 | 10 | 10 | 80282 |
146 | 11 | 10 | 88194 |
147 | 12 | 10 | 105718 |
148 | 13 | 10 | 125706 |
149 | 14 | 10 | 148550 |
150 | 15 | 10 | 161900 |
Calculating mean and st. dev for salary grouped by grades:
df3 %>%
group_by(Grade) %>%
summarise("Mean Salary" = mean(Salary), St.dev = sd(Salary))
## # A tibble: 15 x 3
## Grade `Mean Salary` St.dev
## <int> <dbl> <dbl>
## 1 1 27285.6 2038.774
## 2 2 30333.7 2330.467
## 3 3 33888.0 2975.391
## 4 4 38038.0 3336.471
## 5 5 42562.8 3736.855
## 6 6 47443.5 4164.817
## 7 7 52721.6 4627.938
## 8 8 58383.3 5122.399
## 9 9 64487.9 5660.367
## 10 10 71017.1 6233.492
## 11 11 78019.5 6845.517
## 12 12 93518.5 8207.960
## 13 13 111202.1 9758.557
## 14 14 131409.3 11532.614
## 15 15 152067.0 10416.478
Percentage change in mean salary for each grade (as suggested in the post):
tbl <- df3 %>%
group_by(Grade) %>%
summarise(Mean_Salary = mean(Salary))
tbl %>%
mutate(pChange = (Mean_Salary - lag(Mean_Salary))/lag(Mean_Salary) * 100)
## # A tibble: 15 x 3
## Grade Mean_Salary pChange
## <int> <dbl> <dbl>
## 1 1 27285.6 NA
## 2 2 30333.7 11.17109
## 3 3 33888.0 11.71733
## 4 4 38038.0 12.24622
## 5 5 42562.8 11.89547
## 6 6 47443.5 11.46706
## 7 7 52721.6 11.12502
## 8 8 58383.3 10.73886
## 9 9 64487.9 10.45607
## 10 10 71017.1 10.12469
## 11 11 78019.5 9.86016
## 12 12 93518.5 19.86555
## 13 13 111202.1 18.90920
## 14 14 131409.3 18.17160
## 15 15 152067.0 15.72012