Introduction

This is the code I used for the edX Data Wrangling course offered by Harvard University. Feel free to use it as an example or reference, and to replicate the code if needed. As always, comments are welcome.

Section 1: Data Import

Assessment Part 2: Data Import

# Location of the breast-cancer data file on the UCI repository.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
library(readr)
# Read the first line as data rather than as a header. FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
data <- read_csv(url, col_names = FALSE)
nrow(data) # number of rows
## [1] 569
ncol(data) # number of columns
## [1] 32

Section 2: Tidy Data

Assessment Part 2: Reshaping Data

CO2 dataset questions

library(tidyverse)
library(dslabs)
# Inspect the raw co2 series; it prints as a year (1959-1997) by month table.
co2
##         Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
## 1959 315.42 316.31 316.50 317.56 318.13 318.00 316.39 314.65 313.68 313.18
## 1960 316.27 316.81 317.42 318.87 319.87 319.43 318.01 315.74 314.00 313.68
## 1961 316.73 317.54 318.38 319.31 320.42 319.61 318.42 316.63 314.83 315.16
## 1962 317.78 318.40 319.53 320.42 320.85 320.45 319.45 317.25 316.11 315.27
## 1963 318.58 318.92 319.70 321.22 322.08 321.31 319.58 317.61 316.05 315.83
## 1964 319.41 320.07 320.74 321.40 322.06 321.73 320.27 318.54 316.54 316.71
## 1965 319.27 320.28 320.73 321.97 322.00 321.71 321.05 318.71 317.66 317.14
## 1966 320.46 321.43 322.23 323.54 323.91 323.59 322.24 320.20 318.48 317.94
## 1967 322.17 322.34 322.88 324.25 324.83 323.93 322.38 320.76 319.10 319.24
## 1968 322.40 322.99 323.73 324.86 325.40 325.20 323.98 321.95 320.18 320.09
## 1969 323.83 324.26 325.47 326.50 327.21 326.54 325.72 323.50 322.22 321.62
## 1970 324.89 325.82 326.77 327.97 327.91 327.50 326.18 324.53 322.93 322.90
## 1971 326.01 326.51 327.01 327.62 328.76 328.40 327.20 325.27 323.20 323.40
## 1972 326.60 327.47 327.58 329.56 329.90 328.92 327.88 326.16 324.68 325.04
## 1973 328.37 329.40 330.14 331.33 332.31 331.90 330.70 329.15 327.35 327.02
## 1974 329.18 330.55 331.32 332.48 332.92 332.08 331.01 329.23 327.27 327.21
## 1975 330.23 331.25 331.87 333.14 333.80 333.43 331.73 329.90 328.40 328.17
## 1976 331.58 332.39 333.33 334.41 334.71 334.17 332.89 330.77 329.14 328.78
## 1977 332.75 333.24 334.53 335.90 336.57 336.10 334.76 332.59 331.42 330.98
## 1978 334.80 335.22 336.47 337.59 337.84 337.72 336.37 334.51 332.60 332.38
## 1979 336.05 336.59 337.79 338.71 339.30 339.12 337.56 335.92 333.75 333.70
## 1980 337.84 338.19 339.91 340.60 341.29 341.00 339.39 337.43 335.72 335.84
## 1981 339.06 340.30 341.21 342.33 342.74 342.08 340.32 338.26 336.52 336.68
## 1982 340.57 341.44 342.53 343.39 343.96 343.18 341.88 339.65 337.81 337.69
## 1983 341.20 342.35 342.93 344.77 345.58 345.14 343.81 342.21 339.69 339.82
## 1984 343.52 344.33 345.11 346.88 347.25 346.62 345.22 343.11 340.90 341.18
## 1985 344.79 345.82 347.25 348.17 348.74 348.07 346.38 344.51 342.92 342.62
## 1986 346.11 346.78 347.68 349.37 350.03 349.37 347.76 345.73 344.68 343.99
## 1987 347.84 348.29 349.23 350.80 351.66 351.07 349.33 347.92 346.27 346.18
## 1988 350.25 351.54 352.05 353.41 354.04 353.62 352.22 350.27 348.55 348.72
## 1989 352.60 352.92 353.53 355.26 355.52 354.97 353.75 351.52 349.64 349.83
## 1990 353.50 354.55 355.23 356.04 357.00 356.07 354.67 352.76 350.82 351.04
## 1991 354.59 355.63 357.03 358.48 359.22 358.12 356.06 353.92 352.05 352.11
## 1992 355.88 356.63 357.72 359.07 359.58 359.17 356.94 354.92 352.94 353.23
## 1993 356.63 357.10 358.32 359.41 360.23 359.55 357.53 355.48 353.67 353.95
## 1994 358.34 358.89 359.95 361.25 361.67 360.94 359.55 357.49 355.84 356.00
## 1995 359.98 361.03 361.66 363.48 363.82 363.30 361.94 359.50 358.11 357.80
## 1996 362.09 363.29 364.06 364.76 365.45 365.01 363.70 361.54 359.51 359.65
## 1997 363.23 364.06 364.61 366.40 366.84 365.68 364.52 362.57 360.24 360.83
##         Nov    Dec
## 1959 314.66 315.43
## 1960 314.84 316.03
## 1961 315.94 316.85
## 1962 316.53 317.53
## 1963 316.91 318.20
## 1964 317.53 318.55
## 1965 318.70 319.25
## 1966 319.63 320.87
## 1967 320.56 321.80
## 1968 321.16 322.74
## 1969 322.69 323.95
## 1970 323.85 324.96
## 1971 324.63 325.85
## 1972 326.34 327.39
## 1973 327.99 328.48
## 1974 328.29 329.41
## 1975 329.32 330.59
## 1976 330.14 331.52
## 1977 332.24 333.68
## 1978 333.75 334.78
## 1979 335.12 336.56
## 1980 336.93 338.04
## 1981 338.19 339.44
## 1982 339.09 340.32
## 1983 340.98 342.82
## 1984 342.80 344.04
## 1985 344.06 345.38
## 1986 345.48 346.72
## 1987 347.64 348.78
## 1988 349.91 351.18
## 1989 351.14 352.37
## 1990 352.69 354.07
## 1991 353.64 354.89
## 1992 354.09 355.33
## 1993 355.30 356.78
## 1994 357.59 359.05
## 1995 359.61 360.74
## 1996 360.80 362.38
## 1997 362.49 364.34
# Reshape the co2 values into a wide data frame: fill a 12-column matrix row
# by row, name the columns "1" to "12" (one per month), and append a
# character `year` column covering 1959-1997.
co2_wide <- co2 %>%
  matrix(ncol = 12, byrow = TRUE) %>%
  data.frame() %>%
  setNames(1:12) %>%
  mutate(year = as.character(1959:1997))
co2_wide
##         1      2      3      4      5      6      7      8      9     10     11
## 1  315.42 316.31 316.50 317.56 318.13 318.00 316.39 314.65 313.68 313.18 314.66
## 2  316.27 316.81 317.42 318.87 319.87 319.43 318.01 315.74 314.00 313.68 314.84
## 3  316.73 317.54 318.38 319.31 320.42 319.61 318.42 316.63 314.83 315.16 315.94
## 4  317.78 318.40 319.53 320.42 320.85 320.45 319.45 317.25 316.11 315.27 316.53
## 5  318.58 318.92 319.70 321.22 322.08 321.31 319.58 317.61 316.05 315.83 316.91
## 6  319.41 320.07 320.74 321.40 322.06 321.73 320.27 318.54 316.54 316.71 317.53
## 7  319.27 320.28 320.73 321.97 322.00 321.71 321.05 318.71 317.66 317.14 318.70
## 8  320.46 321.43 322.23 323.54 323.91 323.59 322.24 320.20 318.48 317.94 319.63
## 9  322.17 322.34 322.88 324.25 324.83 323.93 322.38 320.76 319.10 319.24 320.56
## 10 322.40 322.99 323.73 324.86 325.40 325.20 323.98 321.95 320.18 320.09 321.16
## 11 323.83 324.26 325.47 326.50 327.21 326.54 325.72 323.50 322.22 321.62 322.69
## 12 324.89 325.82 326.77 327.97 327.91 327.50 326.18 324.53 322.93 322.90 323.85
## 13 326.01 326.51 327.01 327.62 328.76 328.40 327.20 325.27 323.20 323.40 324.63
## 14 326.60 327.47 327.58 329.56 329.90 328.92 327.88 326.16 324.68 325.04 326.34
## 15 328.37 329.40 330.14 331.33 332.31 331.90 330.70 329.15 327.35 327.02 327.99
## 16 329.18 330.55 331.32 332.48 332.92 332.08 331.01 329.23 327.27 327.21 328.29
## 17 330.23 331.25 331.87 333.14 333.80 333.43 331.73 329.90 328.40 328.17 329.32
## 18 331.58 332.39 333.33 334.41 334.71 334.17 332.89 330.77 329.14 328.78 330.14
## 19 332.75 333.24 334.53 335.90 336.57 336.10 334.76 332.59 331.42 330.98 332.24
## 20 334.80 335.22 336.47 337.59 337.84 337.72 336.37 334.51 332.60 332.38 333.75
## 21 336.05 336.59 337.79 338.71 339.30 339.12 337.56 335.92 333.75 333.70 335.12
## 22 337.84 338.19 339.91 340.60 341.29 341.00 339.39 337.43 335.72 335.84 336.93
## 23 339.06 340.30 341.21 342.33 342.74 342.08 340.32 338.26 336.52 336.68 338.19
## 24 340.57 341.44 342.53 343.39 343.96 343.18 341.88 339.65 337.81 337.69 339.09
## 25 341.20 342.35 342.93 344.77 345.58 345.14 343.81 342.21 339.69 339.82 340.98
## 26 343.52 344.33 345.11 346.88 347.25 346.62 345.22 343.11 340.90 341.18 342.80
## 27 344.79 345.82 347.25 348.17 348.74 348.07 346.38 344.51 342.92 342.62 344.06
## 28 346.11 346.78 347.68 349.37 350.03 349.37 347.76 345.73 344.68 343.99 345.48
## 29 347.84 348.29 349.23 350.80 351.66 351.07 349.33 347.92 346.27 346.18 347.64
## 30 350.25 351.54 352.05 353.41 354.04 353.62 352.22 350.27 348.55 348.72 349.91
## 31 352.60 352.92 353.53 355.26 355.52 354.97 353.75 351.52 349.64 349.83 351.14
## 32 353.50 354.55 355.23 356.04 357.00 356.07 354.67 352.76 350.82 351.04 352.69
## 33 354.59 355.63 357.03 358.48 359.22 358.12 356.06 353.92 352.05 352.11 353.64
## 34 355.88 356.63 357.72 359.07 359.58 359.17 356.94 354.92 352.94 353.23 354.09
## 35 356.63 357.10 358.32 359.41 360.23 359.55 357.53 355.48 353.67 353.95 355.30
## 36 358.34 358.89 359.95 361.25 361.67 360.94 359.55 357.49 355.84 356.00 357.59
## 37 359.98 361.03 361.66 363.48 363.82 363.30 361.94 359.50 358.11 357.80 359.61
## 38 362.09 363.29 364.06 364.76 365.45 365.01 363.70 361.54 359.51 359.65 360.80
## 39 363.23 364.06 364.61 366.40 366.84 365.68 364.52 362.57 360.24 360.83 362.49
##        12 year
## 1  315.43 1959
## 2  316.03 1960
## 3  316.85 1961
## 4  317.53 1962
## 5  318.20 1963
## 6  318.55 1964
## 7  319.25 1965
## 8  320.87 1966
## 9  321.80 1967
## 10 322.74 1968
## 11 323.95 1969
## 12 324.96 1970
## 13 325.85 1971
## 14 327.39 1972
## 15 328.48 1973
## 16 329.41 1974
## 17 330.59 1975
## 18 331.52 1976
## 19 333.68 1977
## 20 334.78 1978
## 21 336.56 1979
## 22 338.04 1980
## 23 339.44 1981
## 24 340.32 1982
## 25 342.82 1983
## 26 344.04 1984
## 27 345.38 1985
## 28 346.72 1986
## 29 348.78 1987
## 30 351.18 1988
## 31 352.37 1989
## 32 354.07 1990
## 33 354.89 1991
## 34 355.33 1992
## 35 356.78 1993
## 36 359.05 1994
## 37 360.74 1995
## 38 362.38 1996
## 39 364.34 1997
# Gather the twelve month columns into (month, co2) key/value pairs, keeping
# `year` as the identifier so that each row is one year-month observation.
co2_tidy <- co2_wide %>%
  gather(key = month, value = co2, -year)
co2_tidy
##     year month    co2
## 1   1959     1 315.42
## 2   1960     1 316.27
## 3   1961     1 316.73
## 4   1962     1 317.78
## 5   1963     1 318.58
## 6   1964     1 319.41
## 7   1965     1 319.27
## 8   1966     1 320.46
## 9   1967     1 322.17
## 10  1968     1 322.40
## 11  1969     1 323.83
## 12  1970     1 324.89
## 13  1971     1 326.01
## 14  1972     1 326.60
## 15  1973     1 328.37
## 16  1974     1 329.18
## 17  1975     1 330.23
## 18  1976     1 331.58
## 19  1977     1 332.75
## 20  1978     1 334.80
## 21  1979     1 336.05
## 22  1980     1 337.84
## 23  1981     1 339.06
## 24  1982     1 340.57
## 25  1983     1 341.20
## 26  1984     1 343.52
## 27  1985     1 344.79
## 28  1986     1 346.11
## 29  1987     1 347.84
## 30  1988     1 350.25
## 31  1989     1 352.60
## 32  1990     1 353.50
## 33  1991     1 354.59
## 34  1992     1 355.88
## 35  1993     1 356.63
## 36  1994     1 358.34
## 37  1995     1 359.98
## 38  1996     1 362.09
## 39  1997     1 363.23
## 40  1959     2 316.31
## 41  1960     2 316.81
## 42  1961     2 317.54
## 43  1962     2 318.40
## 44  1963     2 318.92
## 45  1964     2 320.07
## 46  1965     2 320.28
## 47  1966     2 321.43
## 48  1967     2 322.34
## 49  1968     2 322.99
## 50  1969     2 324.26
## 51  1970     2 325.82
## 52  1971     2 326.51
## 53  1972     2 327.47
## 54  1973     2 329.40
## 55  1974     2 330.55
## 56  1975     2 331.25
## 57  1976     2 332.39
## 58  1977     2 333.24
## 59  1978     2 335.22
## 60  1979     2 336.59
## 61  1980     2 338.19
## 62  1981     2 340.30
## 63  1982     2 341.44
## 64  1983     2 342.35
## 65  1984     2 344.33
## 66  1985     2 345.82
## 67  1986     2 346.78
## 68  1987     2 348.29
## 69  1988     2 351.54
## 70  1989     2 352.92
## 71  1990     2 354.55
## 72  1991     2 355.63
## 73  1992     2 356.63
## 74  1993     2 357.10
## 75  1994     2 358.89
## 76  1995     2 361.03
## 77  1996     2 363.29
## 78  1997     2 364.06
## 79  1959     3 316.50
## 80  1960     3 317.42
## 81  1961     3 318.38
## 82  1962     3 319.53
## 83  1963     3 319.70
## 84  1964     3 320.74
## 85  1965     3 320.73
## 86  1966     3 322.23
## 87  1967     3 322.88
## 88  1968     3 323.73
## 89  1969     3 325.47
## 90  1970     3 326.77
## 91  1971     3 327.01
## 92  1972     3 327.58
## 93  1973     3 330.14
## 94  1974     3 331.32
## 95  1975     3 331.87
## 96  1976     3 333.33
## 97  1977     3 334.53
## 98  1978     3 336.47
## 99  1979     3 337.79
## 100 1980     3 339.91
## 101 1981     3 341.21
## 102 1982     3 342.53
## 103 1983     3 342.93
## 104 1984     3 345.11
## 105 1985     3 347.25
## 106 1986     3 347.68
## 107 1987     3 349.23
## 108 1988     3 352.05
## 109 1989     3 353.53
## 110 1990     3 355.23
## 111 1991     3 357.03
## 112 1992     3 357.72
## 113 1993     3 358.32
## 114 1994     3 359.95
## 115 1995     3 361.66
## 116 1996     3 364.06
## 117 1997     3 364.61
## 118 1959     4 317.56
## 119 1960     4 318.87
## 120 1961     4 319.31
## 121 1962     4 320.42
## 122 1963     4 321.22
## 123 1964     4 321.40
## 124 1965     4 321.97
## 125 1966     4 323.54
## 126 1967     4 324.25
## 127 1968     4 324.86
## 128 1969     4 326.50
## 129 1970     4 327.97
## 130 1971     4 327.62
## 131 1972     4 329.56
## 132 1973     4 331.33
## 133 1974     4 332.48
## 134 1975     4 333.14
## 135 1976     4 334.41
## 136 1977     4 335.90
## 137 1978     4 337.59
## 138 1979     4 338.71
## 139 1980     4 340.60
## 140 1981     4 342.33
## 141 1982     4 343.39
## 142 1983     4 344.77
## 143 1984     4 346.88
## 144 1985     4 348.17
## 145 1986     4 349.37
## 146 1987     4 350.80
## 147 1988     4 353.41
## 148 1989     4 355.26
## 149 1990     4 356.04
## 150 1991     4 358.48
## 151 1992     4 359.07
## 152 1993     4 359.41
## 153 1994     4 361.25
## 154 1995     4 363.48
## 155 1996     4 364.76
## 156 1997     4 366.40
## 157 1959     5 318.13
## 158 1960     5 319.87
## 159 1961     5 320.42
## 160 1962     5 320.85
## 161 1963     5 322.08
## 162 1964     5 322.06
## 163 1965     5 322.00
## 164 1966     5 323.91
## 165 1967     5 324.83
## 166 1968     5 325.40
## 167 1969     5 327.21
## 168 1970     5 327.91
## 169 1971     5 328.76
## 170 1972     5 329.90
## 171 1973     5 332.31
## 172 1974     5 332.92
## 173 1975     5 333.80
## 174 1976     5 334.71
## 175 1977     5 336.57
## 176 1978     5 337.84
## 177 1979     5 339.30
## 178 1980     5 341.29
## 179 1981     5 342.74
## 180 1982     5 343.96
## 181 1983     5 345.58
## 182 1984     5 347.25
## 183 1985     5 348.74
## 184 1986     5 350.03
## 185 1987     5 351.66
## 186 1988     5 354.04
## 187 1989     5 355.52
## 188 1990     5 357.00
## 189 1991     5 359.22
## 190 1992     5 359.58
## 191 1993     5 360.23
## 192 1994     5 361.67
## 193 1995     5 363.82
## 194 1996     5 365.45
## 195 1997     5 366.84
## 196 1959     6 318.00
## 197 1960     6 319.43
## 198 1961     6 319.61
## 199 1962     6 320.45
## 200 1963     6 321.31
## 201 1964     6 321.73
## 202 1965     6 321.71
## 203 1966     6 323.59
## 204 1967     6 323.93
## 205 1968     6 325.20
## 206 1969     6 326.54
## 207 1970     6 327.50
## 208 1971     6 328.40
## 209 1972     6 328.92
## 210 1973     6 331.90
## 211 1974     6 332.08
## 212 1975     6 333.43
## 213 1976     6 334.17
## 214 1977     6 336.10
## 215 1978     6 337.72
## 216 1979     6 339.12
## 217 1980     6 341.00
## 218 1981     6 342.08
## 219 1982     6 343.18
## 220 1983     6 345.14
## 221 1984     6 346.62
## 222 1985     6 348.07
## 223 1986     6 349.37
## 224 1987     6 351.07
## 225 1988     6 353.62
## 226 1989     6 354.97
## 227 1990     6 356.07
## 228 1991     6 358.12
## 229 1992     6 359.17
## 230 1993     6 359.55
## 231 1994     6 360.94
## 232 1995     6 363.30
## 233 1996     6 365.01
## 234 1997     6 365.68
## 235 1959     7 316.39
## 236 1960     7 318.01
## 237 1961     7 318.42
## 238 1962     7 319.45
## 239 1963     7 319.58
## 240 1964     7 320.27
## 241 1965     7 321.05
## 242 1966     7 322.24
## 243 1967     7 322.38
## 244 1968     7 323.98
## 245 1969     7 325.72
## 246 1970     7 326.18
## 247 1971     7 327.20
## 248 1972     7 327.88
## 249 1973     7 330.70
## 250 1974     7 331.01
## 251 1975     7 331.73
## 252 1976     7 332.89
## 253 1977     7 334.76
## 254 1978     7 336.37
## 255 1979     7 337.56
## 256 1980     7 339.39
## 257 1981     7 340.32
## 258 1982     7 341.88
## 259 1983     7 343.81
## 260 1984     7 345.22
## 261 1985     7 346.38
## 262 1986     7 347.76
## 263 1987     7 349.33
## 264 1988     7 352.22
## 265 1989     7 353.75
## 266 1990     7 354.67
## 267 1991     7 356.06
## 268 1992     7 356.94
## 269 1993     7 357.53
## 270 1994     7 359.55
## 271 1995     7 361.94
## 272 1996     7 363.70
## 273 1997     7 364.52
## 274 1959     8 314.65
## 275 1960     8 315.74
## 276 1961     8 316.63
## 277 1962     8 317.25
## 278 1963     8 317.61
## 279 1964     8 318.54
## 280 1965     8 318.71
## 281 1966     8 320.20
## 282 1967     8 320.76
## 283 1968     8 321.95
## 284 1969     8 323.50
## 285 1970     8 324.53
## 286 1971     8 325.27
## 287 1972     8 326.16
## 288 1973     8 329.15
## 289 1974     8 329.23
## 290 1975     8 329.90
## 291 1976     8 330.77
## 292 1977     8 332.59
## 293 1978     8 334.51
## 294 1979     8 335.92
## 295 1980     8 337.43
## 296 1981     8 338.26
## 297 1982     8 339.65
## 298 1983     8 342.21
## 299 1984     8 343.11
## 300 1985     8 344.51
## 301 1986     8 345.73
## 302 1987     8 347.92
## 303 1988     8 350.27
## 304 1989     8 351.52
## 305 1990     8 352.76
## 306 1991     8 353.92
## 307 1992     8 354.92
## 308 1993     8 355.48
## 309 1994     8 357.49
## 310 1995     8 359.50
## 311 1996     8 361.54
## 312 1997     8 362.57
## 313 1959     9 313.68
## 314 1960     9 314.00
## 315 1961     9 314.83
## 316 1962     9 316.11
## 317 1963     9 316.05
## 318 1964     9 316.54
## 319 1965     9 317.66
## 320 1966     9 318.48
## 321 1967     9 319.10
## 322 1968     9 320.18
## 323 1969     9 322.22
## 324 1970     9 322.93
## 325 1971     9 323.20
## 326 1972     9 324.68
## 327 1973     9 327.35
## 328 1974     9 327.27
## 329 1975     9 328.40
## 330 1976     9 329.14
## 331 1977     9 331.42
## 332 1978     9 332.60
## 333 1979     9 333.75
## 334 1980     9 335.72
## 335 1981     9 336.52
## 336 1982     9 337.81
## 337 1983     9 339.69
## 338 1984     9 340.90
## 339 1985     9 342.92
## 340 1986     9 344.68
## 341 1987     9 346.27
## 342 1988     9 348.55
## 343 1989     9 349.64
## 344 1990     9 350.82
## 345 1991     9 352.05
## 346 1992     9 352.94
## 347 1993     9 353.67
## 348 1994     9 355.84
## 349 1995     9 358.11
## 350 1996     9 359.51
## 351 1997     9 360.24
## 352 1959    10 313.18
## 353 1960    10 313.68
## 354 1961    10 315.16
## 355 1962    10 315.27
## 356 1963    10 315.83
## 357 1964    10 316.71
## 358 1965    10 317.14
## 359 1966    10 317.94
## 360 1967    10 319.24
## 361 1968    10 320.09
## 362 1969    10 321.62
## 363 1970    10 322.90
## 364 1971    10 323.40
## 365 1972    10 325.04
## 366 1973    10 327.02
## 367 1974    10 327.21
## 368 1975    10 328.17
## 369 1976    10 328.78
## 370 1977    10 330.98
## 371 1978    10 332.38
## 372 1979    10 333.70
## 373 1980    10 335.84
## 374 1981    10 336.68
## 375 1982    10 337.69
## 376 1983    10 339.82
## 377 1984    10 341.18
## 378 1985    10 342.62
## 379 1986    10 343.99
## 380 1987    10 346.18
## 381 1988    10 348.72
## 382 1989    10 349.83
## 383 1990    10 351.04
## 384 1991    10 352.11
## 385 1992    10 353.23
## 386 1993    10 353.95
## 387 1994    10 356.00
## 388 1995    10 357.80
## 389 1996    10 359.65
## 390 1997    10 360.83
## 391 1959    11 314.66
## 392 1960    11 314.84
## 393 1961    11 315.94
## 394 1962    11 316.53
## 395 1963    11 316.91
## 396 1964    11 317.53
## 397 1965    11 318.70
## 398 1966    11 319.63
## 399 1967    11 320.56
## 400 1968    11 321.16
## 401 1969    11 322.69
## 402 1970    11 323.85
## 403 1971    11 324.63
## 404 1972    11 326.34
## 405 1973    11 327.99
## 406 1974    11 328.29
## 407 1975    11 329.32
## 408 1976    11 330.14
## 409 1977    11 332.24
## 410 1978    11 333.75
## 411 1979    11 335.12
## 412 1980    11 336.93
## 413 1981    11 338.19
## 414 1982    11 339.09
## 415 1983    11 340.98
## 416 1984    11 342.80
## 417 1985    11 344.06
## 418 1986    11 345.48
## 419 1987    11 347.64
## 420 1988    11 349.91
## 421 1989    11 351.14
## 422 1990    11 352.69
## 423 1991    11 353.64
## 424 1992    11 354.09
## 425 1993    11 355.30
## 426 1994    11 357.59
## 427 1995    11 359.61
## 428 1996    11 360.80
## 429 1997    11 362.49
## 430 1959    12 315.43
## 431 1960    12 316.03
## 432 1961    12 316.85
## 433 1962    12 317.53
## 434 1963    12 318.20
## 435 1964    12 318.55
## 436 1965    12 319.25
## 437 1966    12 320.87
## 438 1967    12 321.80
## 439 1968    12 322.74
## 440 1969    12 323.95
## 441 1970    12 324.96
## 442 1971    12 325.85
## 443 1972    12 327.39
## 444 1973    12 328.48
## 445 1974    12 329.41
## 446 1975    12 330.59
## 447 1976    12 331.52
## 448 1977    12 333.68
## 449 1978    12 334.78
## 450 1979    12 336.56
## 451 1980    12 338.04
## 452 1981    12 339.44
## 453 1982    12 340.32
## 454 1983    12 342.82
## 455 1984    12 344.04
## 456 1985    12 345.38
## 457 1986    12 346.72
## 458 1987    12 348.78
## 459 1988    12 351.18
## 460 1989    12 352.37
## 461 1990    12 354.07
## 462 1991    12 354.89
## 463 1992    12 355.33
## 464 1993    12 356.78
## 465 1994    12 359.05
## 466 1995    12 360.74
## 467 1996    12 362.38
## 468 1997    12 364.34
# Plot co2 against month (converted to numeric for the x axis), drawing one
# coloured line per year.
co2_tidy %>%
  ggplot(aes(x = as.numeric(month), y = co2, color = year)) +
  geom_line()

Admissions dataset questions

library(dslabs)
# Load the admissions data set.
data(admissions)

# Drop the applicant counts, then spread `admitted` into one column per gender.
dat <- select(admissions, -applicants)
dat_tidy <- dat %>%
  spread(key = gender, value = admitted)

# Stack the admitted and applicants columns into key/value pairs.
tmp <- admissions %>%
  gather(key = key, value = value, admitted:applicants)
tmp
##    major gender        key value
## 1      A    men   admitted    62
## 2      B    men   admitted    63
## 3      C    men   admitted    37
## 4      D    men   admitted    33
## 5      E    men   admitted    28
## 6      F    men   admitted     6
## 7      A  women   admitted    82
## 8      B  women   admitted    68
## 9      C  women   admitted    34
## 10     D  women   admitted    35
## 11     E  women   admitted    24
## 12     F  women   admitted     7
## 13     A    men applicants   825
## 14     B    men applicants   560
## 15     C    men applicants   325
## 16     D    men applicants   417
## 17     E    men applicants   191
## 18     F    men applicants   373
## 19     A  women applicants   108
## 20     B  women applicants    25
## 21     C  women applicants   593
## 22     D  women applicants   375
## 23     E  women applicants   393
## 24     F  women applicants   341
# Merge `key` and `gender` into a single `column_name` column, joined with
# unite()'s default "_" separator.
tmp2 <- tmp %>%
  unite(column_name, c(key, gender))

Assessment: Combining Tables

library(Lahman)
# Home-run leaders of 2016: keep that season's batting records, sort them by
# HR from highest to lowest, and retain the first ten rows.
top <- Batting %>%
  filter(yearID == 2016) %>%
  arrange(desc(HR)) %>% # most home runs first
  slice(seq_len(10)) # rows 1 through 10
as_tibble(top)
## # A tibble: 10 x 22
##    playerID  yearID stint teamID lgID      G    AB     R     H   X2B   X3B    HR
##    <chr>      <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int> <int>
##  1 trumbma01   2016     1 BAL    AL      159   613    94   157    27     1    47
##  2 cruzne02    2016     1 SEA    AL      155   589    96   169    27     1    43
##  3 daviskh01   2016     1 OAK    AL      150   555    85   137    24     2    42
##  4 doziebr01   2016     1 MIN    AL      155   615   104   165    35     5    42
##  5 encared01   2016     1 TOR    AL      160   601    99   158    34     0    42
##  6 arenano01   2016     1 COL    NL      160   618   116   182    35     6    41
##  7 cartech02   2016     1 MIL    NL      160   549    84   122    27     1    41
##  8 frazito01   2016     1 CHA    AL      158   590    89   133    21     0    40
##  9 bryankr01   2016     1 CHN    NL      155   603   121   176    35     3    39
## 10 canoro01    2016     1 SEA    AL      161   655   107   195    33     2    39
## # ... with 10 more variables: RBI <int>, SB <int>, CS <int>, BB <int>,
## #   SO <int>, IBB <int>, HBP <int>, SH <int>, SF <int>, GIDP <int>
# Preview the player biography table as a tibble (prints only the first rows).
as_tibble(People)
## # A tibble: 20,370 x 26
##    playerID  birthYear birthMonth birthDay birthCountry birthState birthCity   
##    <chr>         <int>      <int>    <int> <chr>        <chr>      <chr>       
##  1 aardsda01      1981         12       27 USA          CO         Denver      
##  2 aaronha01      1934          2        5 USA          AL         Mobile      
##  3 aaronto01      1939          8        5 USA          AL         Mobile      
##  4 aasedo01       1954          9        8 USA          CA         Orange      
##  5 abadan01       1972          8       25 USA          FL         Palm Beach  
##  6 abadfe01       1985         12       17 D.R.         La Romana  La Romana   
##  7 abadijo01      1850         11        4 USA          PA         Philadelphia
##  8 abbated01      1877          4       15 USA          PA         Latrobe     
##  9 abbeybe01      1869         11       11 USA          VT         Essex       
## 10 abbeych01      1866         10       14 USA          NE         Falls City  
## # ... with 20,360 more rows, and 19 more variables: deathYear <int>,
## #   deathMonth <int>, deathDay <int>, deathCountry <chr>, deathState <chr>,
## #   deathCity <chr>, nameFirst <chr>, nameLast <chr>, nameGiven <chr>,
## #   weight <int>, height <int>, bats <fct>, throws <fct>, debut <chr>,
## #   finalGame <chr>, retroID <chr>, bbrefID <chr>, deathDate <date>,
## #   birthDate <date>
# Attach player names to the home-run leaders. The join key is stated
# explicitly so dplyr does not have to guess it, and so the match cannot
# silently widen if the two tables ever share another column name.
top_names <- top %>%
  left_join(People, by = "playerID") %>%
  select(playerID, nameFirst, nameLast, HR)

# Look up the 2016 salary of each home-run leader. right_join keeps every row
# of top_names even when no salary record matches; the key is explicit rather
# than inferred from whatever column names happen to be shared.
top_salary <- Salaries %>%
  filter(yearID == 2016) %>%
  right_join(top_names, by = "playerID") %>%
  select(nameFirst, nameLast, teamID, HR, salary)

# Awards recorded for the 2016 season.
award_players <- AwardsPlayers %>% filter(yearID == 2016)

# Match on playerID only. Without `by`, dplyr joins on every column name the
# two tables share, which (per the Lahman documentation) would also include
# yearID and lgID here and could drop a player whose award league code
# differs from their batting league code.
q7_a <- semi_join(top, award_players, by = "playerID") # leaders with an award
q7_b <- anti_join(award_players, top, by = "playerID") # awards to non-leaders
# A player can appear on several award rows, so count distinct player IDs
# rather than rows.
length(unique(q7_b$playerID))
## [1] 44

Assessment: Web Scraping

Question 1: Which of the first four nodes are tables of team payroll?

library(rvest)
# Fetch the archived payroll page and collect every <table> element on it.
url <- "https://web.archive.org/web/20181024132313/http://www.stevetheump.com/Payrolls.htm"
h <- read_html(url)
nodes <- h %>% html_nodes("table")

# Peek at the raw text of the sixth table.
nodes[[6]] %>% html_text()
## [1] "# Team\nPayroll1. Los Angeles Dodgers\n$235,295,2192. New York Yankees\n$203,812,5063. Philadelphia Phillies\n$180,052,7234. Boston Red Sox\n$162,817,4115. Detroit Tigers\n$162,228,5276. Los Angeles Angels\n$155,692,0007. San Francisco Giants\n$154,185,8788. Texas Rangers\n$136,036,1729. Washington Nationals\n$134,704,43710. Toronto Blue Jays\n$132,628,70011. Arizona Diamondbacks\n$112,688,66612. Cincinnati Reds\n$112,390,77213. St. Louis Cardinals\n$111,020,36014. Atlanta Braves\n$110,897,34115. Baltimore Orioles\n$107,406,62316. Milwaukee Brewers\n$103,844,80617. Colorado Rockies\n$95,832,07118. Seattle Mariners\n$92,081,94319. Kansas City Royals\n$92,034,34520. Chicago White Sox\n$91,159,25421. San Diego Padres\n$90,094,19622. New York Mets\n$89,051,75823. Chicago Cubs\n$89,007,85724. Minnesota Twins\n$85,776,50025. Oakland Athletics\n$83,401,40026. Cleveland Indians\n$82,534,80027. Pittsburgh Pirates\n$78,111,66728. Tampa Bay Rays\n$77,062,89129. Miami Marlins\n$47,565,40030. Houston Astros\n$44,544,174"
# Parse the first four tables into data frames for inspection.
nodes[1:4] %>% lapply(html_table)
## [[1]]
## # A tibble: 1 x 2
##   X1    X2                                                                      
##   <lgl> <chr>                                                                   
## 1 NA    "Salary Stats 1967-2019\nTop ML Player Salaries / Baseball's Luxury Tax"
## 
## [[2]]
## # A tibble: 30 x 3
##     RANK TEAM                 Payroll  
##    <int> <chr>                <chr>    
##  1     1 Boston Red Sox       $235.65M 
##  2     2 San Francisco Giants $208.51M 
##  3     3 Los Angeles Dodgers  $186.14M 
##  4     4 Chicago Cubs         $183.46M 
##  5     5 Washington Nationals $181.59M 
##  6     6 Los Angeles Angels   $175.1M  
##  7     7 New York Yankees     $168.54M 
##  8     8 Seattle Mariners     $162.48M 
##  9     9 Toronto Blue Jays    $162.316M
## 10    10 St. Louis Cardinals  $161.01M 
## # ... with 20 more rows
## 
## [[3]]
## # A tibble: 31 x 5
##    X1    X2                   X3           X4            X5           
##    <chr> <chr>                <chr>        <chr>         <chr>        
##  1 Rank  Team                 25 Man       Disabled List Total Payroll
##  2 1     Los Angeles Dodgers  $155,887,854 $37,354,166   $242,065,828 
##  3 2     New York Yankees     $168,045,699 $5,644,000    $201,539,699 
##  4 3     Boston Red Sox       $136,780,500 $38,239,250   $199,805,178 
##  5 4     Detroit Tigers       $168,500,600 $11,750,000   $199,750,600 
##  6 5     Toronto Blue Jays    $159,175,968 $2,169,400    $177,795,368 
##  7 6     Texas Rangers        $115,162,703 $39,136,360   $175,909,063 
##  8 7     San Francisco Giants $169,504,611 $2,500,000    $172,354,611 
##  9 8     Chicago Cubs         $170,189,880 $2,000,000    $172,189,880 
## 10 9     Washington Nationals $163,111,918 $535,000      $167,846,918 
## # ... with 21 more rows
## 
## [[4]]
## # A tibble: 30 x 5
##     Rank Team      `Opening Day` `Avg Salary` Median     
##    <int> <chr>     <chr>         <chr>        <chr>      
##  1     1 Dodgers   $ 223,352,402 $ 7,445,080  $ 5,166,666
##  2     2 Yankees   $ 213,472,857 $ 7,361,133  $ 3,300,000
##  3     3 Red Sox   $ 182,161,414 $ 6,072,047  $ 3,500,000
##  4     4 Tigers    $ 172,282,250 $ 6,891,290  $ 3,000,000
##  5     5 Giants    $ 166,495,942 $ 5,946,284  $ 4,000,000
##  6     6 Nationals $ 166,010,977 $ 5,724,516  $ 2,500,000
##  7     7 Angels    $ 146,449,583 $ 5,049,986  $ 1,312,500
##  8     8 Rangers   $ 144,307,373 $ 4,509,605  $ 937,500  
##  9     9 Phillies  $ 133,048,000 $ 4,434,933  $ 700,000  
## 10    10 Blue Jays $ 126,369,628 $ 4,357,573  $ 1,650,000
## # ... with 20 more rows

Question 2: For the last 3 components of nodes, which of the following are true?

# Number of tables found; used to index the final three tables from the end.
node_length <- length(nodes)

# Last table on the page.
nodes[[node_length]] %>% html_table()
## # A tibble: 54 x 4
##    X1    X2       X3           X4     
##    <chr> <chr>    <chr>        <chr>  
##  1 Year  Minimum  "Average"    "% Chg"
##  2 2019  $555,000 ""           "-"    
##  3 2018  $545,000 "$4,520,000" ""     
##  4 2017  $535,000 "$4,470,000" "5.4"  
##  5 2016  $507,500 "$4,400,000" "-"    
##  6 2015  $507,500 "$4,250,000" "-"    
##  7 2014  $507,500 "$3,820,000" "12.8" 
##  8 2013  $480,000 "$3,386,212" "5.4"  
##  9 2012  $480,000 "$3,440,000" "3.8"  
## 10 2011  $414,500 "$3,305,393" "0.2"  
## # ... with 44 more rows
# Second-to-last table on the page.
nodes[[node_length - 1]] %>% html_table()
## # A tibble: 31 x 3
##    X1          X2          X3        
##    <chr>       <chr>       <chr>     
##  1 Team        Payroll     Average   
##  2 NY Yankees  $92,538,260 $3,190,974
##  3 Los Angeles $88,124,286 $3,263,862
##  4 Atlanta     $84,537,836 $2,817,928
##  5 Baltimore   $81,447,435 $2,808,532
##  6 Arizona     $81,027,833 $2,893,851
##  7 NY Mets     $79,509,776 $3,180,391
##  8 Boston      $77,940,333 $2,598,011
##  9 Cleveland   $75,880,871 $2,918,495
## 10 Texas       $70,795,921 $2,722,920
## # ... with 21 more rows
# Third-to-last table on the page.
nodes[[node_length - 2]] %>% html_table()
## # A tibble: 31 x 3
##    X1          X2           X3        
##    <chr>       <chr>        <chr>     
##  1 Team        Payroll      Average   
##  2 NY Yankees  $109,791,893 $3,541,674
##  3 Boston      $109,558,908 $3,423,716
##  4 Los Angeles $108,980,952 $3,757,964
##  5 NY Mets     $93,174,428  $3,327,658
##  6 Cleveland   $91,974,979  $3,065,833
##  7 Atlanta     $91,851,687  $2,962,958
##  8 Texas       $88,504,421  $2,854,981
##  9 Arizona     $81,206,513  $2,900,233
## 10 St. Louis   $77,270,855  $2,664,512
## # ... with 21 more rows

Question 3:

# Parse the tenth table on the page and display it.
tab_1 <- nodes[[10]] %>% html_table()
tab_1
## # A tibble: 31 x 4
##    X1    X2                    X3           X4        
##    <chr> <chr>                 <chr>        <chr>     
##  1 No.   Team                  Payroll      Average   
##  2 1.    New York Yankees      $206,333,389 $8,253,336
##  3 2.    Boston Red Sox        $162,747,333 $5,611,977
##  4 3.    Chicago Cubs          $146,859,000 $5,439,222
##  5 4.    Philadelphia Phillies $141,927,381 $5,068,835
##  6 5.    New York Mets         $132,701,445 $5,103,902
##  7 6.    Detroit Tigers        $122,864,929 $4,550,553
##  8 7.    Chicago White Sox     $108,273,197 $4,164,354
##  9 8.    Los Angeles Angels    $105,013,667 $3,621,161
## 10 9.    Seattle Mariners      $98,376,667  $3,513,452
## # ... with 21 more rows
# Parse the nineteenth table on the page and display it.
tab_2 <- nodes[[19]] %>% html_table()
tab_2
## # A tibble: 31 x 3
##    X1          X2           X3        
##    <chr>       <chr>        <chr>     
##  1 Team        Payroll      Average   
##  2 NY Yankees  $109,791,893 $3,541,674
##  3 Boston      $109,558,908 $3,423,716
##  4 Los Angeles $108,980,952 $3,757,964
##  5 NY Mets     $93,174,428  $3,327,658
##  6 Cleveland   $91,974,979  $3,065,833
##  7 Atlanta     $91,851,687  $2,962,958
##  8 Texas       $88,504,421  $2,854,981
##  9 Arizona     $81,206,513  $2,900,233
## 10 St. Louis   $77,270,855  $2,664,512
## # ... with 21 more rows
library(readr)

# Strip the header row from each table; tab_1 additionally loses its
# leading "No." column so both tables end up with the same three columns.
tab1_new <- setNames(tab_1[-1, -1], c("Team", "Payroll", "Average"))
tab2_new <- setNames(tab_2[-1, ], c("Team", "Payroll", "Average"))

# tab12 stores the row count of the full join on Team, not the joined table.
tab12 <- nrow(full_join(tab1_new, tab2_new, by = "Team"))

Question 4-5:

library(rvest)
library(tidyverse)

# Pinned revision (oldid) of the Wikipedia article on opinion polling for
# the United Kingdom European Union membership referendum.
url <- "https://en.wikipedia.org/w/index.php?title=Opinion_polling_for_the_United_Kingdom_European_Union_membership_referendum&oldid=896735054"
h <- read_html(url)

# Collect every <table> node, then parse the first 42 into data frames.
tab <- h %>% html_nodes("table")
tab[1:42] %>% lapply(html_table)
## [[1]]
## # A tibble: 27 x 21
##    X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12   X13  
##    <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 "Par~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  2 "UK ~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  3 ""    <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  4 "Acc~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  5 "197~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  6 "Tre~ Sing~ (UK ~ Maas~ (UK ~ Trea~ (UK ~ Trea~ (UK ~ Trea~ (UK ~ <NA>  <NA> 
##  7 "Sin~ (UK ~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  8 "Maa~ (UK ~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  9 "Tre~ (UK ~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## 10 "Tre~ (UK ~ <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## # ... with 17 more rows, and 8 more variables: X14 <chr>, X15 <chr>, X16 <chr>,
## #   X17 <chr>, X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>
## 
## [[2]]
## # A tibble: 5 x 2
##   X1                        X2               
##   <chr>                     <chr>            
## 1 Single European Act, 1986 (UK ratification)
## 2 Maastricht Treaty, 1992   (UK ratification)
## 3 Treaty of Amsterdam, 1997 (UK ratification)
## 4 Treaty of Nice, 2001      (UK ratification)
## 5 Treaty of Lisbon, 2007    (UK ratification)
## 
## [[3]]
## # A tibble: 10 x 2
##    X1                X2                     
##    <chr>             <chr>                  
##  1 Members 1973–1979 (elected by parliament)
##  2 Members 1979–1984 (1979 election)        
##  3 Members 1984–1989 (1984 election)        
##  4 Members 1989–1994 (1989 election)        
##  5 Members 1994–1999 (1994 election)        
##  6 Members 1999–2004 (1999 election)        
##  7 Members 2004–2009 (2004 election)        
##  8 Members 2009–2014 (2009 election)        
##  9 Members 2014–2019 (2014 election)        
## 10 Members 2019–2020 (2019 election)        
## 
## [[4]]
## # A tibble: 8 x 7
##   `Conducted by`               Date    Remain  Leave   Undecided Lead  Notes    
##   <chr>                        <chr>   <chr>   <chr>   <chr>     <chr> <chr>    
## 1 Conducted by                 Date    ""      ""      Undecided Lead  "Notes"  
## 2 What UK Thinks: EU[14]       23 June "52%"   "48%"   N/A       4%    "Six mos~
## 3 Elections Etc.[15]           23 June "50.6%" "49.4%" N/A       1.2%  "Twelve ~
## 4 HuffPost Pollster[16]        23 June "45.8%" "45.3%" 9%        0.5%  ""       
## 5 Number Cruncher Politics[17] 22 June "46%"   "44%"   10%       2%    "Equal w~
## 6 Financial Times[18]          13 June "48%"   "46%"   6%        2%    "Five mo~
## 7 The Telegraph[20]            21 June "51%"   "49%"   N/A       2%    "Six mos~
## 8 The Economist[21]            6 June  "44%"   "44%"   9%        0%    "Exclude~
## 
## [[5]]
## # A tibble: 1 x 2
##   X1    X2                                                                      
##   <lgl> <chr>                                                                   
## 1 NA    This list is incomplete; you can help by adding missing items.  (Novemb~
## 
## [[6]]
## # A tibble: 134 x 9
##    `Date(s) conducted` Remain  Leave   Undecided Lead  Sample     `Conducted by`
##    <chr>               <chr>   <chr>   <chr>     <chr> <chr>      <chr>         
##  1 Date(s) conducted   ""      ""      Undecided Lead  Sample     Conducted by  
##  2 23 June 2016        "48.1%" "51.9%" N/A       3.8%  33,577,342 Results of th~
##  3 23 June             "52%"   "48%"   N/A       4%    4,772      YouGov        
##  4 22 June             "55%"   "45%"   N/A       10%   4,700      Populus       
##  5 20–22 June          "51%"   "49%"   N/A       2%    3,766      YouGov        
##  6 20–22 June          "49%"   "46%"   1%        3%    1,592      Ipsos MORI    
##  7 20–22 June          "44%"   "45%"   9%        1%    3,011      Opinium       
##  8 17–22 June          "54%"   "46%"   N/A       8%    1,032      ComRes        
##  9 17–22 June          "48%"   "42%"   11%       6%    1,032      ComRes        
## 10 16–22 June          "41%"   "43%"   16%       2%    2,320      TNS           
## # ... with 124 more rows, and 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[7]]
## # A tibble: 73 x 9
##    `Date(s) conducted` Remain Leave Undecided Sample `Conducted by` Notes  ``   
##    <chr>               <chr>  <chr> <chr>     <chr>  <chr>          <chr>  <chr>
##  1 Date(s) conducted   ""     ""    Undecided Sample Conducted by   "Note~ <NA> 
##  2 17–18 Dec           "41%"  "42%" 17%       1,598  YouGov         ""     <NA> 
##  3 12–14 Dec           "58%"  "32%" 10%       529    Ipsos MORI     ""     <NA> 
##  4 11–13 Dec           "56%"  "35%" 8%        1,001  ComRes         ""     <NA> 
##  5 11–13 Dec           "42%"  "41%" 17%       2,053  ICM            ""     <NA> 
##  6 4–6 Dec             "43%"  "39%" 17%       2,022  ICM            ""     <NA> 
##  7 2–3 Dec             "36%"  "43%" 21%       1,001  ORB            ""     <NA> 
##  8 30 Nov–3 Dec        "40%"  "42%" 18%       10,015 Survation      "Incl~ <NA> 
##  9 20–24 Nov           "41%"  "41%" 18%       4,317  YouGov         ""     <NA> 
## 10 19–24 Nov           "40%"  "38%" 22%       1,699  YouGov         ""     <NA> 
## # ... with 63 more rows, and 1 more variable:  <chr>
## 
## [[8]]
## # A tibble: 46 x 9
##    `Date(s) conducted` Remain Leave Undecided Sample `Conducted by` Notes  ``   
##    <chr>               <chr>  <chr> <chr>     <chr>  <chr>          <chr>  <chr>
##  1 Date(s) conducted   ""     ""    Undecided Sample Conducted by   "Note~ <NA> 
##  2 14–15 Dec           "40%"  "39%" 21%       1,648  YouGov         ""     <NA> 
##  3 30 Nov–1 Dec        "42%"  "39%" 20%       1,763  YouGov         ""     <NA> 
##  4 20–26 Nov           "38%"  "43%" 19%       1,641  YouGov         ""     <NA> 
##  5 21–23 Nov           "32%"  "48%" 20%       2,049  ComRes         ""     <NA> 
##  6 20–21 Nov           "40%"  "41%" 19%       1,970  YouGov         ""     <NA> 
##  7 19–21 Nov           "40%"  "41%" 19%       2,314  YouGov         ""     <NA> 
##  8 16–17 Nov           "39%"  "39%" 21%       1,589  YouGov         ""     <NA> 
##  9 7 Nov               "31%"  "54%" 15%       1,020  Survation      ""     <NA> 
## 10 2–3 Nov             "38%"  "41%" 21%       1,652  YouGov         ""     <NA> 
## # ... with 36 more rows, and 1 more variable:  <chr>
## 
## [[9]]
## # A tibble: 40 x 7
##    `Date(s) conducted` Remain Leave Undecided Sample  `Conducted by` Notes  
##    <chr>               <chr>  <chr> <chr>     <chr>   <chr>          <chr>  
##  1 Date(s) conducted   ""     ""    Undecided Sample  Conducted by   "Notes"
##  2 1–9 Dec             "37%"  "43%" 20%       Unknown YouGov         ""     
##  3 10–11 Nov           "39%"  "39%" 22%       Unknown YouGov[35]     ""     
##  4 13–14 Oct           "42%"  "37%" 20%       Unknown YouGov[35]     ""     
##  5 23–27 Sep           "36%"  "44%" 20%       1,922   YouGov         ""     
##  6 15–16 Sep           "42%"  "39%" 20%       Unknown YouGov[35]     ""     
##  7 18–19 Aug           "46%"  "34%" 20%       Unknown YouGov[35]     ""     
##  8 6–8 Aug             "32%"  "53%" 15%       1,945   Opinium        ""     
##  9 4–5 Aug             "43%"  "35%" 22%       Unknown YouGov[35]     ""     
## 10 18–24 Jul           "35%"  "45%" 21%       1,968   YouGov         ""     
## # ... with 30 more rows
## 
## [[10]]
## # A tibble: 3 x 7
##   `Date(s) conducted` Remain Leave Undecided Sample  `Conducted by`   Notes     
##   <chr>               <chr>  <chr> <chr>     <chr>   <chr>            <chr>     
## 1 Date(s) conducted   ""     ""    Undecided Sample  Conducted by     Notes     
## 2 27–28 November      "30%"  "51%" 9%        Unknown YouGov/The Sun   Northern ~
## 3 13–15 November      "30%"  "56%" 14%       1,957   Opinium/Observer Northern ~
## 
## [[11]]
## # A tibble: 4 x 7
##   `Date(s) conducted` Remain Leave Undecided Sample  `Conducted by` Notes       
##   <chr>               <chr>  <chr> <chr>     <chr>   <chr>          <chr>       
## 1 Date(s) conducted   ""     ""    Undecided Sample  Conducted by   Notes       
## 2 15–16 December      "41%"  "41%" 19%       Unknown YouGov/The Sun Northern Ir~
## 3 8–9 December        "35%"  "44%" 20%       Unknown YouGov/The Sun Northern Ir~
## 4 7–8 August          "30%"  "52%" 19%       Unknown YouGov/The Sun Northern Ir~
## 
## [[12]]
## # A tibble: 2 x 7
##   `Date(s) conducted` Remain Leave Undecided Sample  `Conducted by` Notes       
##   <chr>               <chr>  <chr> <chr>     <chr>   <chr>          <chr>       
## 1 Date(s) conducted   ""     ""    Undecided Sample  Conducted by   Notes       
## 2 8–9 September       "33%"  "47%" 19%       Unknown YouGov/The Sun Northern Ir~
## 
## [[13]]
## # A tibble: 3 x 6
##   `Date(s) conducted` Remain  Leave   Undecided Sample `Held by`      
##   <chr>               <chr>   <chr>   <chr>     <chr>  <chr>          
## 1 Date(s) conducted   ""      ""      Undecided Sample Held by        
## 2 23 June 2016        "46.6%" "53.4%" N/A       –      England Results
## 3 9–16 September 2015 "40%"   "43%"   17%       1,712  YouGov         
## 
## [[14]]
## # A tibble: 3 x 6
##   `Date(s) conducted`   Remain  Leave   Undecided Sample `Held by`             
##   <chr>                 <chr>   <chr>   <chr>     <chr>  <chr>                 
## 1 Date(s) conducted     ""      ""      Undecided Sample Held by               
## 2 23 June 2016          "46.7%" "53.3%" N/A       –      Results               
## 3 26 June – 3 July 2015 "42%"   "43%"   15%       956    Panelbase/Sunday Times
## 
## [[15]]
## # A tibble: 7 x 6
##   `Date(s) conducted`   Remain  Leave   Undecided Sample `Held by`              
##   <chr>                 <chr>   <chr>   <chr>     <chr>  <chr>                  
## 1 Date(s) conducted     ""      ""      Undecided Sample Held by                
## 2 23 June 2016          "59.9%" "40.1%" N/A       –      London Results         
## 3 2–6 June 2016         "48%"   "35%"   13%       1,179  YouGov                 
## 4 26 April – 1 May 2016 "51%"   "34%"   14%       1,005  Opinium/Evening Standa~
## 5 4–6 January 2016      "39%"   "34%"   27%       1,156  YouGov/LBC             
## 6 17–19 November 2014   "45%"   "37%"   14%       1,124  YouGov/Evening Standard
## 7 20–25 June 2013       "41%"   "39%"   20%       1,269  YouGov/Evening Standard
## 
## [[16]]
## # A tibble: 35 x 6
##    `Date(s) conducted` Remain  Leave   Undecided Sample `Held by`               
##    <chr>               <chr>   <chr>   <chr>     <chr>  <chr>                   
##  1 Date(s) conducted   ""      ""      Undecided Sample Held by                 
##  2 23 June 2016        "62.0%" "38.0%" N/A       –      Scotland Results        
##  3 6–12 Jun 2016       "58%"   "33%"   8%        1,000  Ipsos Mori/STV          
##  4 4–22 May 2016       "53%"   "24%"   23%       1,008  TNS[permanent dead link]
##  5 6–10 May 2016       "54%"   "32%"   14%       1,000  ICM/The Scotsman        
##  6 1–2 May 2016        "58%"   "19%"   19%       1,024  Survation/Daily Record  
##  7 23–28 April 2016    "57%"   "33%"   11%       1,074  Panelbase/Sunday Times  
##  8 18–25 April 2016    "66%"   "29%"   5%        1,015  Ipsos MORI/STV          
##  9 1–24 April 2016     "48%"   "21%"   31%       1,012  TNS                     
## 10 15–20 April 2016    "54%"   "28%"   17%       1,005  Survation/Daily Record  
## # ... with 25 more rows
## 
## [[17]]
## # A tibble: 16 x 6
##    `Date(s) conducted`   Remain  Leave   Undecided Sample `Held by`        
##    <chr>                 <chr>   <chr>   <chr>     <chr>  <chr>            
##  1 Date(s) conducted     ""      ""      Undecided Sample Held by          
##  2 23 June 2016          "47.5%" "52.5%" N/A       –      Wales Results    
##  3 30 May – 2 June 2016  "41%"   "41%"   18%       1,017  YouGov           
##  4 7–11 April 2016       "38%"   "39%"   16%       1,011  YouGov           
##  5 9–11 February 2016    "37%"   "45%"   18%       1,024  YouGov           
##  6 21–24 September 2015  "42%"   "38%"   21%       1,010  YouGov           
##  7 4–6 May 2015          "47%"   "33%"   16%       1,202  YouGov/ITV Wales 
##  8 24–27 March 2015      "44%"   "38%"   14%       1,189  YouGov/ITV Wales 
##  9 5–9 March 2015        "43%"   "36%"   17%       1,279  YouGov/ITV Wales 
## 10 19–26 February 2015   "63%"   "33%"   4%        1,000  ICM/BBC          
## 11 19–21 January 2015    "44%"   "36%"   16%       1,036  YouGov/ITV Wales 
## 12 2–5 December 2014     "42%"   "39%"   15%       1,131  YouGov/ITV Wales 
## 13 8–11 September 2014   "43%"   "37%"   15%       1,025  YouGov/ITV Wales 
## 14 26 June – 1 July 2014 "41%"   "36%"   18%       1,035  YouGov/ITV Wales 
## 15 21–24 February 2014   "54%"   "40%"   6%        1,000  ICM/BBC          
## 16 14–25 June 2013       "29%"   "37%"   35%       1,015  Beaufort Research
## 
## [[18]]
## # A tibble: 1 x 2
##   X1    X2                                                                      
##   <lgl> <chr>                                                                   
## 1 NA    This section needs additional citations for verification. Please help i~
## 
## [[19]]
## # A tibble: 8 x 7
##   `Date(s) conducted` Remain  Leave   Undecided Sample     `Held by`    Notes   
##   <chr>               <chr>   <chr>   <chr>     <chr>      <chr>        <chr>   
## 1 Date(s) conducted   ""      ""      Undecided Sample     Held by      "Notes" 
## 2 23 June 2016        "55.8%" "44.2%" N/A       –          Northern Ir~ ""      
## 3 Late June 2016      "37%"   "26%"   <NA>      Over 1,000 Belfast Tel~ ""      
## 4 20 June 2016        "57%"   "43%"   Exc. DKs  2,090      The NI Sun/~ ""      
## 5 17–19 May 2016      "57%"   "35%"   9%        1,090      LucidTalk    ""      
## 6 May 2016            "44%"   "20%"   35%       1,005      Ipsos MORI   "Questi~
## 7 19–21 October 2015  "56.5%" "28.3%" 15.2%     2,517      LucidTalk    ""      
## 8 2–16 October 2015   "55%"   "13%"   32%       1,012      BBC/RTÉ      ""      
## 
## [[20]]
## # A tibble: 4 x 6
##   `Date(s) conducted` Remain  Leave  Undecided Sample `Held by`          
##   <chr>               <chr>   <chr>  <chr>     <chr>  <chr>              
## 1 Date(s) conducted   ""      ""     Undecided Sample Held by            
## 2 23 June 2016        "95.9%" "4.1%" N/A       –      Gibraltar Results  
## 3 13–15 May 2016      "94%"   "2%"   4%        596    Gibraltar Chronicle
## 4 11–15 April 2016    "88%"   "8%"   3%        596    Gibraltar Chronicle
## 
## [[21]]
## # A tibble: 33 x 7
##    `Date(s) conducted` Remain Leave Undecided Sample `Held by`      Notes       
##    <chr>               <chr>  <chr> <chr>     <chr>  <chr>          <chr>       
##  1 Date(s) conducted   ""     ""    Undecided Sample Held by        "Notes"     
##  2 1–2 June 2015       "55%"  "24%" 18%       1,063  YouGov/Prospe~ "Northern I~
##  3 8–9 May 2015        "58%"  "24%" 16%       1,302  YouGov/Sunday~ "Northern I~
##  4 3–4 May 2015        "56%"  "20%" 20%       1,664  YouGov/The Sun "Northern I~
##  5 19–20 April 2015    "57%"  "22%" 17%       2,078  YouGov/The Sun "Northern I~
##  6 22–23 March 2015    "57%"  "22%" 18%       1,641  YouGov/The Sun "Northern I~
##  7 22–23 February 2015 "57%"  "21%" 17%       1,772  YouGov/The Sun "Northern I~
##  8 25–26 January 2015  "54%"  "25%" 16%       1,656  YouGov/The Sun "Northern I~
##  9 18–19 January 2015  "57%"  "21%" 19%       1,747  YouGov/Britis~ "Northern I~
## 10 14–15 Dec 2014      "55%"  "24%" 16%       1,648  YouGov/The Sun ""          
## # ... with 23 more rows
## 
## [[22]]
## # A tibble: 29 x 4
##    Country        Remain `Does not matter` Leave
##    <chr>          <chr>  <chr>             <chr>
##  1 Country        ""     ""                ""   
##  2 Austria        "41%"  "41%"             "19%"
##  3 Belgium        "49%"  "38%"             "13%"
##  4 Bulgaria       "67%"  "27%"             "7%" 
##  5 Croatia        "49%"  "41%"             "10%"
##  6 Cyprus         "35%"  "45%"             "19%"
##  7 Czech Republic "40%"  "47%"             "13%"
##  8 Denmark        "56%"  "31%"             "13%"
##  9 Estonia        "65%"  "28%"             "8%" 
## 10 Finland        "50%"  "39%"             "11%"
## # ... with 19 more rows
## 
## [[23]]
## # A tibble: 29 x 3
##    Country        Remain Leave
##    <chr>          <chr>  <chr>
##  1 Country        ""     ""   
##  2 Austria        "24%"  "76%"
##  3 Belgium        "34%"  "66%"
##  4 Bulgaria       "52%"  "48%"
##  5 Croatia        "36%"  "64%"
##  6 Cyprus         "33%"  "67%"
##  7 Czech Republic "42%"  "58%"
##  8 Denmark        "51%"  "49%"
##  9 Estonia        "44%"  "56%"
## 10 Finland        "30%"  "70%"
## # ... with 19 more rows
## 
## [[24]]
## # A tibble: 10 x 3
##    Country  Remain Leave
##    <chr>    <chr>  <chr>
##  1 Country  ""     ""   
##  2 Denmark  "46%"  "24%"
##  3 Finland  "49%"  "19%"
##  4 France   "51%"  "22%"
##  5 Germany  "55%"  "19%"
##  6 Italy    "63%"  "20%"
##  7 Norway   "34%"  "27%"
##  8 Portugal "74%"  "8%" 
##  9 Spain    "69%"  "11%"
## 10 Sweden   "43%"  "26%"
## 
## [[25]]
## # A tibble: 177 x 11
##    `Date(s) conducted` Right Wrong Undecided Lead  Sample `Conducted by`        
##    <chr>               <chr> <chr> <chr>     <chr> <chr>  <chr>                 
##  1 4–5 Aug 2020        39%   49%   12%       10%   1,606  YouGov                
##  2 30–31 Jul 2020      41%   47%   13%       6%    1,623  YouGov                
##  3 22–23 Jul 2020      42%   47%   11%       5%    1,648  YouGov                
##  4 11–12 Jun 2020      40%   47%   13%       7%    1,693  YouGov                
##  5 29–30 May 2020      42%   45%   13%       3%    1,650  YouGov                
##  6 18–19 May 2020      43%   45%   13%       2%    1,718  YouGov                
##  7 16–17 Apr 2020      43%   44%   13%       1%    2,015  YouGov                
##  8 24–26 Mar 2020      48%   40%   12%       8%    1,010  Number Cruncher Polit~
##  9 9–10 Feb 2020       43%   44%   13%       1%    1,694  YouGov                
## 10 31 Jan – 2 Feb 2020 43%   46%   12%       3%    1,575  YouGov                
## # ... with 167 more rows, and 4 more variables: Polling type <chr>,
## #   Notes <chr>,  <chr>,  <chr>
## 
## [[26]]
## # A tibble: 2 x 9
##   `Date(s) conducted` Right Wrong Undecided Lead  Sample `Conducted by`
##   <chr>               <chr> <chr> <chr>     <chr> <chr>  <chr>         
## 1 26 Feb–1 Mar 2019   22%   70%   8%        48%   5,004  YouGov        
## 2 5–8 Jul 2018        76%   21%   2%        55%   966    YouGov        
## # ... with 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[27]]
## # A tibble: 215 x 9
##    `Date(s) conducted` Remain   Leave   Neither   Lead   Sample  `Conducted by` 
##    <chr>               <chr>    <chr>   <chr>     <chr>  <chr>   <chr>          
##  1 18–21 Oct 2019      55%      45%     —         10%    2,017   Deltapoll      
##  2 17 Oct 2019         EU and ~ EU and~ EU and U~ EU an~ EU and~ EU and UK nego~
##  3 2–14 Oct 2019       32%      54%     14%       22%    26,000  ComRes         
##  4 9–11 Oct 2019       51%      45%     3%        6%     1,622   Panelbase      
##  5 25 Sep 2019         51%      45%     4%        6%     821     Survation      
##  6 5–9 Sep 2019        37%      34%     29%       3%     1,144   Kantar         
##  7 5–7 Sep 2019        46%      40%     14%       6%     2,049   Deltapoll      
##  8 5–6 Sep 2019        52%      45%     3%        7%     864     Panelbase      
##  9 5–6 Sep 2019        50%      44%     6%        6%     809     Survation      
## 10 3–4 Sep 2019        46%      43%     12%       3%     1,533   YouGov         
## # ... with 205 more rows, and 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[28]]
## # A tibble: 28 x 9
##    `Date(s) conducted` Remain Leave Neither Lead  Sample `Conducted by`
##    <chr>               <chr>  <chr> <chr>   <chr> <chr>  <chr>         
##  1 26 Feb-1 Mar 2019   70%    22%   8%      48%   5,004  YouGov        
##  2 26 Feb-1 Mar 2019   69%    16%   15%     50%   5,004  YouGov        
##  3 26 Feb-1 Mar 2019   72%    19%   10%     53%   5,004  YouGov        
##  4 15-22 Feb 2019      76%    14%   10%     62%   499    BMG Research  
##  5 15-22 Feb 2019      48%    22%   30%     26%   1,125  BMG Research  
##  6 2-7 Nov 2018        61%    34%   4%      27%   914    Panelbase     
##  7 30 Oct-2 Nov 2018   45%    41%   14%     4%    1,031  YouGov        
##  8 3-6 Oct 2018        90%    7%    2%      83%   665    YouGov        
##  9 13-18 Sep 2018      90%    7%    3%      83%   1,054  YouGov        
## 10 6-11 Sep 2018       63%    18%   19%     45%   1,645  YouGov        
## # ... with 18 more rows, and 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[29]]
## # A tibble: 66 x 11
##    `Date(s) conducted` Round   Remain   Deal    `No deal`  None   Lead   Sample 
##    <chr>               <chr>   <chr>    <chr>   <chr>      <chr>  <chr>  <chr>  
##  1 17–18 Oct 2019      —       45%      34%     17%        5%     11%    1,025  
##  2 17–18 Oct 2019      —       41%      38%     18%        4%     3%     1,025  
##  3 17–18 Oct 2019      —       42%      42%     —          16%    0%     1,025  
##  4 17 Oct 2019         EU and~ EU and ~ EU and~ EU and UK~ EU an~ EU an~ EU and~
##  5 2–14 Oct 2019       —       42%      30%     20%        8%     12%    26,000 
##  6 5–6 Sep 2019        —       55%      —       41%        5%     14%    864    
##  7 24 Jul 2019         Boris ~ Boris J~ Boris ~ Boris Joh~ Boris~ Boris~ Boris ~
##  8 2–5 Jul 2019        —       —        26%     34%        40%    8%     1,532  
##  9 2–5 Jul 2019        —       43%      25%     —          32%    18%    1,532  
## 10 2–5 Jul 2019        —       44%      —       38%        18%    6%     1,532  
## # ... with 56 more rows, and 3 more variables: Conducted by <chr>,
## #   Polling type <chr>, Notes <chr>
## 
## [[30]]
## # A tibble: 2 x 11
##   `Date(s) conducted` Round Remain Deal  `No deal` None  Lead  Sample
##   <chr>               <chr> <chr>  <chr> <chr>     <chr> <chr> <chr> 
## 1 6-11 Sep 2018       I     58%    10%   9%        23%   48%   1,645 
## 2 6-11 Sep 2018       II    82%    18%   —         —     64%   1,645 
## # ... with 3 more variables: Conducted by <chr>, Polling type <chr>,
## #   Notes <chr>
## 
## [[31]]
## # A tibble: 3 x 8
##   `Date(s) conducted` Join  `Not join` Undecided Lead  Sample `Conducted by`    
##   <chr>               <chr> <chr>      <chr>     <chr> <chr>  <chr>             
## 1 Date(s) conducted   ""    ""         ""        Lead  Sample Conducted by      
## 2 27 Mar 2019         "38%" "38%"      "25%"     0%    1,005  Sky Data          
## 3 27 Mar-5 Apr 2018   "31%" "47%"      "22%"     16%   1,037  Number Cruncher P~
## # ... with 1 more variable: Polling type <chr>
## 
## [[32]]
## # A tibble: 141 x 9
##    `Date(s) conducted` Support   Oppose  Neither  Lead   Sample  `Conducted by` 
##    <chr>               <chr>     <chr>   <chr>    <chr>  <chr>   <chr>          
##  1 17–18 Oct 2019      47%       44%     9%       3%     1,025   Survation      
##  2 17–18 Oct 2019      43%       41%     16%      2%     1,025   Survation      
##  3 17 Oct 2019         EU and U~ EU and~ EU and ~ EU an~ EU and~ EU and UK nego~
##  4 2–14 Oct 2019       41%       45%     14%      4%     26,000  ComRes         
##  5 29–30 Sep 2019      47%       29%     24%      18%    1,620   YouGov         
##  6 29–30 Sep 2019      52%       23%     25%      29%    1,620   YouGov         
##  7 5–9 Sep 2019        53%       29%     18%      24%    1,144   Kantar         
##  8 5–7 Sep 2019        43%       42%     15%      1%     2,049   Deltapoll      
##  9 3–4 Sep 2019        46%       41%     13%      5%     1,533   YouGov         
## 10 29–31 Aug 2019      41%       47%     12%      6%     2,028   Deltapoll      
## # ... with 131 more rows, and 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[33]]
## # A tibble: 21 x 9
##    `Date(s) conducted` Support Oppose Neither Lead  Sample `Conducted by`
##    <chr>               <chr>   <chr>  <chr>   <chr> <chr>  <chr>         
##  1 26 Feb-1 Mar 2019   65%     22%    13%     43%   5,004  YouGov        
##  2 15-22 Feb 2019      54%     19%    27%     35%   499    BMG Research  
##  3 15-22 Feb 2019      38%     25%    37%     13%   1,125  BMG Research  
##  4 30 Oct-2 Nov 2018   41%     49%    10%     8%    1,031  YouGov        
##  5 3-6 Oct 2018        83%     10%    6%      73%   665    YouGov        
##  6 13-18 Sep 2018      86%     8%     6%      78%   1,054  YouGov        
##  7 6-11 Sep 2018       52%     22%    25%     30%   1,645  YouGov        
##  8 30 Aug-5 Sep 2018   56%     33%    10%     23%   620    YouGov        
##  9 30 Aug-5 Sep 2018   66%     22%    11%     44%   1,081  YouGov        
## 10 30 Aug-5 Sep 2018   59%     33%    8%      26%   1,058  YouGov        
## # ... with 11 more rows, and 2 more variables: Polling type <chr>, Notes <chr>
## 
## [[34]]
## # A tibble: 31 x 14
##    vteBrexit  vteBrexit  ``     ``     ``    ``    ``    ``    ``    ``    ``   
##    <chr>      <chr>      <chr>  <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 "Renegoti~ "Renegoti~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  2 "Referend~ "\"Should~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  3 "Referend~ "European~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  4 "Backgrou~ "Treaties~ Treat~ "Rome~ Legi~ "Eur~ Prop~ "Eur~ Elec~ "201~ By-e~
##  5 "Treaties" "Rome\n19~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  6 "Legislat~ "European~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  7 "Proposed~ "European~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  8 "Election~ "2014 Eur~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  9 "By-elect~ "2014 Cla~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## 10 "Other"    "UK acces~ <NA>    <NA>  <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## # ... with 21 more rows, and 3 more variables:  <chr>,  <chr>,  <chr>
## 
## [[35]]
## # A tibble: 6 x 2
##   X1             X2                                                             
##   <chr>          <chr>                                                          
## 1 Treaties       "Rome\n1972 Accession Treaty\nSingle European Act\nMaastricht\~
## 2 Legislation    "European Communities Act 1972\nAmendments: 1986\n1993\n1998\n~
## 3 Proposed bills "European Union Bill 2004–05\nEuropean Communities Act 1972 (R~
## 4 Elections      "2014 European Parliament election\n2015 general election"     
## 5 By-elections   "2014 Clacton\n2014 Heywood and Middleton\n2014 Rochester and ~
## 6 Other          "UK accession\n1973 EC enlargement\nUK membership\n1975 EC mem~
## 
## [[36]]
## # A tibble: 2 x 2
##   X1     X2                                                                     
##   <chr>  <chr>                                                                  
## 1 Remain "Britain Stronger in Europe (official campaign)\nLabour In for Britain"
## 2 Leave  "Vote Leave (official campaign)\nLeave.EU\nBeLeave\nGrassroots Out\nLa~
## 
## [[37]]
## # A tibble: 5 x 2
##   X1                                  X2                                        
##   <chr>                               <chr>                                     
## 1 Political partyleadership elections "Conservative\n2016\n2019\nScottish, Feb ~
## 2 Opposition to Brexit                "Proposed second referendum\nPeople's Vot~
## 3 Elections                           "2017 local\n2017 general\n2018 local\n20~
## 4 By-elections                        "2016 Witney\n2016 Richmond Park\n2016 Sl~
## 5 Other                               "International reactions\nMarch to Leave\~
## 
## [[38]]
## # A tibble: 2 x 2
##   X1                                             X2                             
##   <chr>                                          <chr>                          
## 1 on Northern Ireland andthe Republic of Ireland "Brexit and the Irish border\n~
## 2 Other                                          "on Gibraltar\non the EU\nScie~
## 
## [[39]]
## # A tibble: 3 x 2
##   X1           X2                                                               
##   <chr>        <chr>                                                            
## 1 White papers "Brexit plan\nRepeal Bill plan\nChequers plan\nBrexit withdrawal~
## 2 Enacted      "Notification of Withdrawal Act 2017\nWithdrawal Act 2018 (Gibra~
## 3 Proposed     "Terms of Withdrawal from EU (Referendum) Bills\nUK Withdrawal f~
## 
## [[40]]
## # A tibble: 14 x 10
##    `vte Opinion pol~ `vte Opinion pol~ ``    ``    ``    ``    ``    ``    ``   
##    <chr>             <chr>             <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 General elections "1945\n1950\n195~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  2 Leadership appro~ "2015\n2017\n201~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  3 European electio~ "2009\n2014\n201~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  4 Referendums       "United Kingdom\~ Unit~ "201~ Nort~ "197~ Scot~ "197~ Wales
##  5 United Kingdom    "2011 Alternativ~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  6 Northern Ireland  "1973 border pol~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  7 Scotland          "1979 devolution~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  8 Wales             "1979 devolution~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
##  9 Issues            "Europe\nPost-re~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## 10 Devolved legisla~ "London\n2008\nM~ Lond~ "200~ Nort~ "201~ Scot~ "200~ Wales
## 11 London            "2008\nMayor\n20~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## 12 Northern Ireland  "2016\n2017\n202~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## 13 Scotland          "2007\n2011\n201~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## 14 Wales             "2011\n2016\n202~ <NA>   <NA> <NA>   <NA> <NA>   <NA> <NA> 
## # ... with 1 more variable:  <chr>
## 
## [[41]]
## # A tibble: 4 x 2
##   X1               X2                                                   
##   <chr>            <chr>                                                
## 1 United Kingdom   "2011 Alternative Vote\n2016 EU membership"          
## 2 Northern Ireland "1973 border poll\n1998 Good Friday Agreement"       
## 3 Scotland         "1979 devolution\n1997 devolution\n2014 independence"
## 4 Wales            "1979 devolution\n1997 devolution\n2011 devolution"  
## 
## [[42]]
## # A tibble: 4 x 2
##   X1               X2                                                  
##   <chr>            <chr>                                               
## 1 London           "2008\nMayor\n2012\nMayor\n2016\nMayor\n2021\nMayor"
## 2 Northern Ireland "2016\n2017\n2022\nNext"                            
## 3 Scotland         "2007\n2011\n2016\n2021\nNext"                      
## 4 Wales            "2011\n2016\n2021\nLeadership approval\nNext"
# Column headers of the sixth <table> node scraped from the page
names(html_table(tab[[6]], fill = TRUE))
## [1] "Date(s) conducted" "Remain"            "Leave"            
## [4] "Undecided"         "Lead"              "Sample"           
## [7] "Conducted by"      "Polling type"      "Notes"

Section 3: String Processing

# Flag entries that cannot be read as a plausible height in inches.
#
# x:        vector of reported heights (numeric or character)
# smallest: lowest value, in inches, accepted as plausible
# tallest:  highest value, in inches, accepted as plausible
#
# Returns a logical vector the same length as x: TRUE where the entry does
# not convert to a number or lies outside [smallest, tallest].
not_inches <- function(x, smallest = 50, tallest = 84) {
  height <- suppressWarnings(as.numeric(x)) # non-numeric text becomes NA
  out_of_range <- height < smallest | height > tallest
  is.na(height) | out_of_range
}
not_inches(85)
## [1] TRUE
library(stringr)
# Highlight every digit or literal "ft" in the candidate strings
yes <- c("1", "5 ft", "9")
no <- c("12", "123", " 1", "a4", "b")
s <- c(yes, no)
pattern <- "\\d|ft"
str_view_all(s, pattern)

# Does the string contain a lower-case letter anywhere?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[a-z]"
str_detect(animals, pattern)
## [1]  TRUE  TRUE  TRUE FALSE

# Does the string end in an upper-case letter? (same animals vector as above)
pattern <- "[A-Z]$"
str_detect(animals, pattern)
## [1] FALSE FALSE FALSE  TRUE

Case Study: Extracting a Table from PDF

library(dslabs)
# Load the research_funding_rates dataset into the workspace and print it
data("research_funding_rates")
research_funding_rates 
##            discipline applications_total applications_men applications_women
## 1   Chemical sciences                122               83                 39
## 2   Physical sciences                174              135                 39
## 3             Physics                 76               67                  9
## 4          Humanities                396              230                166
## 5  Technical sciences                251              189                 62
## 6   Interdisciplinary                183              105                 78
## 7 Earth/life sciences                282              156                126
## 8     Social sciences                834              425                409
## 9    Medical sciences                505              245                260
##   awards_total awards_men awards_women success_rates_total success_rates_men
## 1           32         22           10                26.2              26.5
## 2           35         26            9                20.1              19.3
## 3           20         18            2                26.3              26.9
## 4           65         33           32                16.4              14.3
## 5           43         30           13                17.1              15.9
## 6           29         12           17                15.8              11.4
## 7           56         38           18                19.9              24.4
## 8          112         65           47                13.4              15.3
## 9           75         46           29                14.9              18.8
##   success_rates_women
## 1                25.6
## 2                23.1
## 3                22.2
## 4                19.3
## 5                21.0
## 6                21.8
## 7                14.3
## 8                11.5
## 9                11.2

Assessment Part 2: String Processing Part 3

library(rvest)
library(tidyverse)
library(stringr)
# Scrape the Brexit opinion-poll table from a pinned Wikipedia revision.
url <- "https://en.wikipedia.org/w/index.php?title=Opinion_polling_for_the_United_Kingdom_European_Union_membership_referendum&oldid=896735054"
tab <- read_html(url) %>% html_nodes("table")
polls <- tab[[6]] %>% html_table(fill = TRUE)

# Rename the nine columns once; polls_tidy below inherits these names.
names(polls) <- c(
  "dates", "remain", "leave", "undecided", "lead",
  "samplesize", "pollster", "poll_type", "notes"
)
library(dplyr) # load dplyr to clean data

# A usable Remain entry is a percentage such as "48%" or "48.1%".
pattern <- "^\\d+(\\.\\d+)?%$"

# Keep only rows whose Remain value is a percentage. Matching the content
# (rather than counting characters) still drops repeated header rows and
# non-numeric entries, and no longer depends on the number of digits.
polls_tidy <- polls %>%
  filter(str_detect(remain, pattern))

# Convert "48.1%"-style strings to proportions: drop the percent sign,
# coerce to numeric, then divide by 100 (%>% binds tighter than /).
polls_tidy$remain %>%
  str_replace("%", "") %>%
  as.numeric() / 100
##   [1] 0.481 0.520 0.550 0.510 0.490 0.440 0.540 0.480 0.410 0.450 0.420 0.530
##  [13] 0.450 0.440 0.440 0.420 0.420 0.370 0.460 0.430 0.390 0.450 0.440 0.460
##  [25] 0.400 0.480 0.530 0.420 0.440 0.450 0.430 0.430 0.480 0.410 0.430 0.400
##  [37] 0.410 0.420 0.440 0.510 0.440 0.440 0.410 0.410 0.450 0.550 0.440 0.440
##  [49] 0.520 0.550 0.470 0.430 0.550 0.380 0.360 0.380 0.440 0.420 0.440 0.430
##  [61] 0.420 0.490 0.390 0.410 0.450 0.430 0.440 0.510 0.510 0.490 0.480 0.430
##  [73] 0.530 0.380 0.400 0.390 0.350 0.450 0.420 0.400 0.390 0.440 0.510 0.390
##  [85] 0.350 0.410 0.510 0.450 0.490 0.400 0.480 0.410 0.460 0.470 0.430 0.450
##  [97] 0.480 0.490 0.400 0.400 0.400 0.390 0.410 0.390 0.480 0.480 0.370 0.380
## [109] 0.420 0.510 0.450 0.400 0.540 0.360 0.430 0.490 0.410 0.360 0.420 0.380
## [121] 0.550 0.440 0.540 0.410 0.520 0.420 0.380 0.420 0.440
# Alternative conversion: parse_number() extracts the number from each
# "<number>%" string directly, so no explicit "%" removal is needed.
parse_number(polls_tidy$remain)/100
##   [1] 0.481 0.520 0.550 0.510 0.490 0.440 0.540 0.480 0.410 0.450 0.420 0.530
##  [13] 0.450 0.440 0.440 0.420 0.420 0.370 0.460 0.430 0.390 0.450 0.440 0.460
##  [25] 0.400 0.480 0.530 0.420 0.440 0.450 0.430 0.430 0.480 0.410 0.430 0.400
##  [37] 0.410 0.420 0.440 0.510 0.440 0.440 0.410 0.410 0.450 0.550 0.440 0.440
##  [49] 0.520 0.550 0.470 0.430 0.550 0.380 0.360 0.380 0.440 0.420 0.440 0.430
##  [61] 0.420 0.490 0.390 0.410 0.450 0.430 0.440 0.510 0.510 0.490 0.480 0.430
##  [73] 0.530 0.380 0.400 0.390 0.350 0.450 0.420 0.400 0.390 0.440 0.510 0.390
##  [85] 0.350 0.410 0.510 0.450 0.490 0.400 0.480 0.410 0.460 0.470 0.430 0.450
##  [97] 0.480 0.490 0.400 0.400 0.400 0.390 0.410 0.390 0.480 0.480 0.370 0.380
## [109] 0.420 0.510 0.450 0.400 0.540 0.360 0.430 0.490 0.410 0.360 0.420 0.380
## [121] 0.550 0.440 0.540 0.410 0.520 0.420 0.380 0.420 0.440
# Replace the "N/A" placeholder in the undecided column with "0";
# all other entries keep their original "<number>%" text.
str_replace(polls_tidy$undecided, "N/A", "0")
##   [1] "0"   "0"   "0"   "0"   "1%"  "9%"  "0"   "11%" "16%" "11%" "13%" "2%" 
##  [13] "13%" "9%"  "12%" "9%"  "13%" "16%" "11%" "3%"  "15%" "5%"  "7%"  "9%" 
##  [25] "13%" "3%"  "0"   "11%" "13%" "0"   "11%" "9%"  "5%"  "11%" "16%" "16%"
##  [37] "13%" "15%" "9%"  "3%"  "12%" "18%" "13%" "16%" "10%" "3%"  "14%" "12%"
##  [49] "7%"  "5%"  "14%" "10%" "5%"  "21%" "22%" "16%" "11%" "13%" "11%" "11%"
##  [61] "14%" "0"   "26%" "13%" "17%" "13%" "10%" "6%"  "9%"  "8%"  "11%" "13%"
##  [73] "6%"  "28%" "16%" "17%" "30%" "17%" "12%" "16%" "18%" "13%" "5%"  "18%"
##  [85] "30%" "14%" "0"   "12%" "10%" "19%" "11%" "17%" "19%" "4%"  "16%" "16%"
##  [97] "7%"  "15%" "19%" "18%" "19%" "19%" "18%" "18%" "15%" "0"   "25%" "25%"
## [109] "17%" "10%" "23%" "19%" "10%" "25%" "18%" "10%" "17%" "19%" "19%" "20%"
## [121] "9%"  "14%" "10%" "18%" "0"   "17%" "22%" "12%" "18%"
# Extract every "<day> <month>" token from each date string. A poll whose
# fieldwork crosses a month boundary contributes more than one token.
temp <- str_extract_all(polls_tidy$dates, "\\d+\\s[a-zA-Z]{3,5}")

# Keep the last token per poll (the end date). vapply() guarantees a character
# vector: a row without any match becomes NA instead of silently turning the
# whole result into a list, as sapply() would.
end_date <- vapply(
  temp,
  function(x) if (length(x) > 0) x[length(x)] else NA_character_,
  character(1)
)
end_date
##   [1] "23 June" "23 June" "22 June" "22 June" "22 June" "22 June" "22 June"
##   [8] "22 June" "22 June" "20 June" "19 June" "19 June" "18 June" "17 June"
##  [15] "17 June" "16 June" "15 June" "15 June" "15 June" "14 June" "13 June"
##  [22] "13 June" "13 June" "13 June" "13 June" "12 June" "12 June" "10 June"
##  [29] "10 June" "9 June"  "6 June"  "5 June"  "5 June"  "3 June"  "3 June" 
##  [36] "3 June"  "31 May"  "29 May"  "29 May"  "29 May"  "25 May"  "24 May" 
##  [43] "24 May"  "23 May"  "22 May"  "22 May"  "19 May"  "17 May"  "17 May" 
##  [50] "16 May"  "15 May"  "15 May"  "15 May"  "12 May"  "12 May"  "12 May" 
##  [57] "8 May"   "6 May"   "3 May"   "29 Apr"  "29 Apr"  "29 Apr"  "28 Apr" 
##  [64] "26 Apr"  "26 Apr"  "26 Apr"  "24 Apr"  "24 Apr"  "19 Apr"  "18 Apr" 
##  [71] "17 Apr"  "17 Apr"  "17 Apr"  "14 Apr"  "14 Apr"  "12 Apr"  "11 Apr" 
##  [78] "10 Apr"  "10 Apr"  "7 Apr"   "4 Apr"   "3 Apr"   "3 Apr"   "1 Apr"  
##  [85] "29 Mar"  "29 Mar"  "28 Mar"  "24 Mar"  "22 Mar"  "22 Mar"  "20 Mar" 
##  [92] "20 Mar"  "19 Mar"  "14 Mar"  "13 Mar"  "11 Mar"  "10 Mar"  "6 Mar"  
##  [99] "6 Mar"   "3 Mar"   "2 Mar"   "1 Mar"   "29 Feb"  "28 Feb"  "28 Feb" 
## [106] "25 Feb"  "23 Feb"  "23 Feb"  "22 Feb"  "22 Feb"  "20 Feb"  "19 Feb" 
## [113] "16 Feb"  "15 Feb"  "14 Feb"  "14 Feb"  "7 Feb"   "4 Feb"   "31 Jan" 
## [120] "28 Jan"  "25 Jan"  "25 Jan"  "24 Jan"  "24 Jan"  "21 Jan"  "17 Jan" 
## [127] "16 Jan"  "14 Jan"  "10 Jan"

Section 4: Dates, Times, and Text Mining

Assessment Part 1: Dates, Times, and Text Mining

Brexit data

library(dslabs)
library(lubridate)
options(digits = 3)    # 3 significant digits

# Two-digit fields only: the strings themselves do not say which field is the year
dates <- c("09-01-02", "01-12-07", "02-03-04")
dmy(dates) # day-month-year is just one valid reading; more information is needed to pick the right parser
## [1] "2002-01-09" "2007-12-01" "2004-03-02"
data(brexit_polls)
# Number of polls that started in April. lubridate::month() returns the month
# number, so the comparison does not depend on the session locale the way the
# month names returned by months() do.
sum(month(brexit_polls$startdate) == 4)
## [1] 25
# Number of polls whose end date rounds to the week of 2016-06-12.
# Compare against a Date rather than relying on string coercion.
sum(round_date(brexit_polls$enddate, unit = "week") == ymd("2016-06-12"))
## [1] 14
## [1] 14
# Count the polls ending on each weekday, one day at a time; the table() call
# at the end produces all seven counts in a single step.
# NOTE(review): weekdays() returns names in the session locale, so the string
# comparisons below presumably assume an English locale — confirm.
sum(weekdays(brexit_polls$enddate) == "Monday")
## [1] 20
sum(weekdays(brexit_polls$enddate) == "Tuesday")
## [1] 23
sum(weekdays(brexit_polls$enddate) == "Wednesday")
## [1] 12
sum(weekdays(brexit_polls$enddate) == "Thursday")
## [1] 17
sum(weekdays(brexit_polls$enddate) == "Friday")
## [1] 14
sum(weekdays(brexit_polls$enddate) == "Saturday")
## [1] 4
sum(weekdays(brexit_polls$enddate) == "Sunday")
## [1] 37
table(weekdays(brexit_polls$enddate))
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##        14        20         4        37        17        23        12

MovieLens data

data("movielens")
library(dplyr)
# Number of ratings per calendar year, busiest year first.
# The result is stored under its own name so the movielens dataset is not
# overwritten by the summary, and timestamp is referenced through the pipe
# (data masking) instead of reaching back to the global movielens object.
ratings_by_year <- movielens %>%
  mutate(datetime = as_datetime(timestamp)) %>%
  count(year(datetime)) %>%
  arrange(desc(n))

Assessment Part 2: Dates, Times, and Text Mining

Pride and Prejudice

library(tidyverse)
library(gutenbergr)
library(tidytext)
library(Rcpp)
options(digits = 3)

library(dplyr)
# All metadata rows whose title contains "Pride and Prejudice"
filter(gutenberg_metadata, str_detect(title, "Pride and Prejudice"))
## # A tibble: 6 x 8
##   gutenberg_id title   author  gutenberg_autho~ language gutenberg_books~ rights
##          <int> <chr>   <chr>              <int> <chr>    <chr>            <chr> 
## 1         1342 Pride ~ Austen~               68 en       Best Books Ever~ Publi~
## 2        20686 Pride ~ Austen~               68 en       Harvard Classic~ Publi~
## 3        20687 Pride ~ Austen~               68 en       Harvard Classic~ Publi~
## 4        26301 Pride ~ Austen~               68 en       Best Books Ever~ Publi~
## 5        37431 Pride ~ <NA>                  NA en       <NA>             Publi~
## 6        42671 Pride ~ Austen~               68 en       Best Books Ever~ Publi~
## # ... with 1 more variable: has_text <lgl>
# Same title search, but starting from gutenberg_works() restricted to
# English-language entries instead of the raw metadata table
gutenberg_works(languages = "en") %>%
  filter(str_detect(title, "Pride and Prejudice"))
## # A tibble: 2 x 8
##   gutenberg_id title   author  gutenberg_autho~ language gutenberg_books~ rights
##          <int> <chr>   <chr>              <int> <chr>    <chr>            <chr> 
## 1         1342 Pride ~ Austen~               68 en       Best Books Ever~ Publi~
## 2        37431 Pride ~ <NA>                  NA en       <NA>             Publi~
## # ... with 1 more variable: has_text <lgl>
# Download and tokenise the book once, then refine the same object; every
# gutenberg_download() call fetches the whole text again.
words <- gutenberg_download(1342) %>%
  unnest_tokens(word, text) # total words
words <- words %>%
  anti_join(stop_words, by = "word") # drop stop words

# Remaining words, ignoring tokens that contain digits, used at least 100
# times, most frequent first
words %>%
  filter(!str_detect(word, "\\d+")) %>%
  count(word) %>%
  filter(n >= 100) %>%
  arrange(desc(n))
## # A tibble: 24 x 2
##    word          n
##    <chr>     <int>
##  1 elizabeth   596
##  2 darcy       373
##  3 bennet      295
##  4 miss        283
##  5 jane        264
##  6 bingley     258
##  7 time        203
##  8 lady        183
##  9 sister      179
## 10 wickham     162
## # ... with 14 more rows

Afinn

library(textdata)
afinn <- get_sentiments("afinn")

# Tokens of the book with stop words and digit-containing tokens removed
words <- gutenberg_download(1342) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "\\d+"))

# Keep only words present in the AFINN lexicon, then summarise:
#   n  = proportion of those words with a positive value
#   n1 = number of those words with value exactly 4
# The join key is named explicitly so the join cannot silently pick up any
# other column the two tables might share.
afinn_sentiments <- words %>%
  inner_join(afinn, by = "word") %>%
  summarise(n = mean(value > 0), n1 = sum(value == 4))
# dataset changed, proportion of > 0 should be 0.588, while the answer is 0.563

Final Assessment: Puerto Rico Hurricane Mortality

library(tidyverse)
library(tidyr)
library(pdftools)
options(digits = 3)    # report 3 significant digits

# Path to the mortality report PDF installed with the dslabs package
fn <- system.file(
  "extdata", "RD-Mortality-Report_2015-18-180531.pdf",
  package = "dslabs"
)
# Open the PDF by feeding a `start` command to cmd.exe.
# NOTE(review): this goes through cmd.exe, so it presumably only works on
# Windows — confirm before running elsewhere.
system("cmd.exe", input = paste("start", fn))
## [1] 0
# Extract the text of the PDF, then split the ninth element on newlines
txt <- pdf_text(fn)
x <- str_split(txt[9], "\n")
x
length(x) # number of entries in the list
## [[1]]
##  [1] "6/4/2018                                         Departamento de Salud - Registro Demográfico - División de Calidad y Estadísticas Vitales"                                                                                            
##  [2] ""                                                                                                                                                                                                                                      
##  [3] "SEP     2015    2016     2017    2018"                                                                                                                                                                                                 
##  [4] "    1      75      75       94          0"                                                                                                                                                                                             
##  [5] "    2      77      67       69          0                                                                       Defunciones Ocurridas en Septiembre por Día y Año"                                                                     
##  [6] "    3      67      78       80          0"                                                                                                                                                                                             
##  [7] "                                              140"                                                                                                                                                                                     
##  [8] "    4      71      99       84          0"                                                                                                                                                                                             
##  [9] "    5      62      89       74          0"                                                                                                                                                                                             
## [10] "                                              120"                                                                                                                                                                                     
## [11] "    6      77      74       83          0"                                                                                                                                                                                             
## [12] "    7      85      67       87          0"                                                                                                                                                                                             
## [13] "    8      84      77       94          0     100"                                                                                                                                                                                     
## [14] "    9      79      90       72          0"                                                                                                                                                                                             
## [15] "   10      66      73       98          0      80"                                                                                                                                                                                     
## [16] "   11      92      78       92          0"                                                                                                                                                                                             
## [17] "   12      79      66       80          0      60"                                                                                                                                                                                     
## [18] "   13      81      88      100          0"                                                                                                                                                                                             
## [19] "   14      70      81       79          0      40"                                                                                                                                                                                     
## [20] "   15      87      91       84          0"                                                                                                                                                                                             
## [21] "   16      70      71       80          0      20"                                                                                                                                                                                     
## [22] "   17      70      68       88          0"                                                                                                                                                                                             
## [23] "   18      76      79       78          0       0"                                                                                                                                                                                     
## [24] "   19      81      82       75          0              1      2      3       4      5      6       7      8      9       10   11   12   13   14      15   16     17   18    19    20   21   22   23   24   25   26   27   28   29   30"
## [25] "   20      69      79      106          0    Fuente: Registro Demográfico - División de Calidad y Estadísticas Vitales              2015          2016         2017        2018"                                                       
## [26] "   21      70      67      124          0"                                                                                                                                                                                             
## [27] "   22      68      97      110          0"                                                                                                                                                                                             
## [28] "   23      70      71      109          0"                                                                                                                                                                                             
## [29] "   24      78      79      122          0"                                                                                                                                                                                             
## [30] "   25      60      75      137          0"                                                                                                                                                                                             
## [31] "   26      76      82      132          0"                                                                                                                                                                                             
## [32] "   27      78      82      122          0"                                                                                                                                                                                             
## [33] "   28      84      81      112          0"                                                                                                                                                                                             
## [34] "   29      83      70      131          0"                                                                                                                                                                                             
## [35] "   30      73      91      132          0"                                                                                                                                                                                             
## [36] "Total    2258    2367     2928          0"                                                                                                                                                                                             
## [37] "Avg        75      79       98          0"                                                                                                                                                                                             
## [38] "Max        92      99      137          0   NOTA:"                                                                                                                                                                                     
## [39] "Min        60      66       69          0   * Año 2017 - Datos preliminares; archivo final en proceso operacional de revisión de calidad y validación."                                                                                
## [40] "Med        76     78.5      93          0   ** Año 2018 - Datos preliminares; Certificados de Defunción registrados en sistema hasta 5/31/2018."                                                                                       
## [41] ""
## [1] 1
# Take the first (and only) list element: the character vector of page lines
s <- x[[1]]
s
length(s)
##  [1] "6/4/2018                                         Departamento de Salud - Registro Demográfico - División de Calidad y Estadísticas Vitales"                                                                                            
##  [2] ""                                                                                                                                                                                                                                      
##  [3] "SEP     2015    2016     2017    2018"                                                                                                                                                                                                 
##  [4] "    1      75      75       94          0"                                                                                                                                                                                             
##  [5] "    2      77      67       69          0                                                                       Defunciones Ocurridas en Septiembre por Día y Año"                                                                     
##  [6] "    3      67      78       80          0"                                                                                                                                                                                             
##  [7] "                                              140"                                                                                                                                                                                     
##  [8] "    4      71      99       84          0"                                                                                                                                                                                             
##  [9] "    5      62      89       74          0"                                                                                                                                                                                             
## [10] "                                              120"                                                                                                                                                                                     
## [11] "    6      77      74       83          0"                                                                                                                                                                                             
## [12] "    7      85      67       87          0"                                                                                                                                                                                             
## [13] "    8      84      77       94          0     100"                                                                                                                                                                                     
## [14] "    9      79      90       72          0"                                                                                                                                                                                             
## [15] "   10      66      73       98          0      80"                                                                                                                                                                                     
## [16] "   11      92      78       92          0"                                                                                                                                                                                             
## [17] "   12      79      66       80          0      60"                                                                                                                                                                                     
## [18] "   13      81      88      100          0"                                                                                                                                                                                             
## [19] "   14      70      81       79          0      40"                                                                                                                                                                                     
## [20] "   15      87      91       84          0"                                                                                                                                                                                             
## [21] "   16      70      71       80          0      20"                                                                                                                                                                                     
## [22] "   17      70      68       88          0"                                                                                                                                                                                             
## [23] "   18      76      79       78          0       0"                                                                                                                                                                                     
## [24] "   19      81      82       75          0              1      2      3       4      5      6       7      8      9       10   11   12   13   14      15   16     17   18    19    20   21   22   23   24   25   26   27   28   29   30"
## [25] "   20      69      79      106          0    Fuente: Registro Demográfico - División de Calidad y Estadísticas Vitales              2015          2016         2017        2018"                                                       
## [26] "   21      70      67      124          0"                                                                                                                                                                                             
## [27] "   22      68      97      110          0"                                                                                                                                                                                             
## [28] "   23      70      71      109          0"                                                                                                                                                                                             
## [29] "   24      78      79      122          0"                                                                                                                                                                                             
## [30] "   25      60      75      137          0"                                                                                                                                                                                             
## [31] "   26      76      82      132          0"                                                                                                                                                                                             
## [32] "   27      78      82      122          0"                                                                                                                                                                                             
## [33] "   28      84      81      112          0"                                                                                                                                                                                             
## [34] "   29      83      70      131          0"                                                                                                                                                                                             
## [35] "   30      73      91      132          0"                                                                                                                                                                                             
## [36] "Total    2258    2367     2928          0"                                                                                                                                                                                             
## [37] "Avg        75      79       98          0"                                                                                                                                                                                             
## [38] "Max        92      99      137          0   NOTA:"                                                                                                                                                                                     
## [39] "Min        60      66       69          0   * Año 2017 - Datos preliminares; archivo final en proceso operacional de revisión de calidad y validación."                                                                                
## [40] "Med        76     78.5      93          0   ** Año 2018 - Datos preliminares; Certificados de Defunción registrados en sistema hasta 5/31/2018."                                                                                       
## [41] ""
## [1] 41
library(stringr)
# Strip leading and trailing whitespace from every line, then show the result.
s <- str_trim(s)
s
##  [1] "6/4/2018                                         Departamento de Salud - Registro Demográfico - División de Calidad y Estadísticas Vitales"                                                                                         
##  [2] ""                                                                                                                                                                                                                                   
##  [3] "SEP     2015    2016     2017    2018"                                                                                                                                                                                              
##  [4] "1      75      75       94          0"                                                                                                                                                                                              
##  [5] "2      77      67       69          0                                                                       Defunciones Ocurridas en Septiembre por Día y Año"                                                                      
##  [6] "3      67      78       80          0"                                                                                                                                                                                              
##  [7] "140"                                                                                                                                                                                                                                
##  [8] "4      71      99       84          0"                                                                                                                                                                                              
##  [9] "5      62      89       74          0"                                                                                                                                                                                              
## [10] "120"                                                                                                                                                                                                                                
## [11] "6      77      74       83          0"                                                                                                                                                                                              
## [12] "7      85      67       87          0"                                                                                                                                                                                              
## [13] "8      84      77       94          0     100"                                                                                                                                                                                      
## [14] "9      79      90       72          0"                                                                                                                                                                                              
## [15] "10      66      73       98          0      80"                                                                                                                                                                                     
## [16] "11      92      78       92          0"                                                                                                                                                                                             
## [17] "12      79      66       80          0      60"                                                                                                                                                                                     
## [18] "13      81      88      100          0"                                                                                                                                                                                             
## [19] "14      70      81       79          0      40"                                                                                                                                                                                     
## [20] "15      87      91       84          0"                                                                                                                                                                                             
## [21] "16      70      71       80          0      20"                                                                                                                                                                                     
## [22] "17      70      68       88          0"                                                                                                                                                                                             
## [23] "18      76      79       78          0       0"                                                                                                                                                                                     
## [24] "19      81      82       75          0              1      2      3       4      5      6       7      8      9       10   11   12   13   14      15   16     17   18    19    20   21   22   23   24   25   26   27   28   29   30"
## [25] "20      69      79      106          0    Fuente: Registro Demográfico - División de Calidad y Estadísticas Vitales              2015          2016         2017        2018"                                                       
## [26] "21      70      67      124          0"                                                                                                                                                                                             
## [27] "22      68      97      110          0"                                                                                                                                                                                             
## [28] "23      70      71      109          0"                                                                                                                                                                                             
## [29] "24      78      79      122          0"                                                                                                                                                                                             
## [30] "25      60      75      137          0"                                                                                                                                                                                             
## [31] "26      76      82      132          0"                                                                                                                                                                                             
## [32] "27      78      82      122          0"                                                                                                                                                                                             
## [33] "28      84      81      112          0"                                                                                                                                                                                             
## [34] "29      83      70      131          0"                                                                                                                                                                                             
## [35] "30      73      91      132          0"                                                                                                                                                                                             
## [36] "Total    2258    2367     2928          0"                                                                                                                                                                                          
## [37] "Avg        75      79       98          0"                                                                                                                                                                                          
## [38] "Max        92      99      137          0   NOTA:"                                                                                                                                                                                  
## [39] "Min        60      66       69          0   * Año 2017 - Datos preliminares; archivo final en proceso operacional de revisión de calidad y validación."                                                                             
## [40] "Med        76     78.5      93          0   ** Año 2018 - Datos preliminares; Certificados de Defunción registrados en sistema hasta 5/31/2018."                                                                                    
## [41] ""
# Inspect the first line of the trimmed text.
s[[1L]]
## [1] "6/4/2018                                         Departamento de Salud - Registro Demográfico - División de Calidad y Estadísticas Vitales"
#Question 6
# Positions of every line that mentions "2015".
header_index <- which(str_detect(s, "2015"))
header_index
## [1]  3 25
#Question 7
# Split the *text* of the first matching line (not its numeric index) on runs
# of whitespace. header_index holds more than one match, so the first one is
# selected explicitly.
header <- s[header_index[1]] %>% str_split("\\s+", simplify = TRUE)
header
##      [,1]  [,2]   [,3]   [,4]   [,5]  
## [1,] "SEP" "2015" "2016" "2017" "2018"
#Question 8
# Position of the line containing "Total".
tail_index <- s %>% str_which("Total")
tail_index
## [1] 36
#Question 9
# Count the runs of digits on each line; lines with exactly one run carry
# a single number only.
n <- str_count(s, "\\d+")
sum(n == 1)
which(n == 1)
## [1] 2
## [1]  7 10
#Question 10
# Rows to discard: everything up to and including the first header line,
# the lines holding a single number, and everything from the "Total" line
# to the end. header_index has more than one element, so its first entry is
# picked explicitly instead of relying on `:` silently truncating (with a
# warning) to the first element.
out <- c(seq_len(header_index[1]), which(n == 1), tail_index:length(s))
s <- s[-out]
length(s)
## [1] 30
#Question 12
# Break each remaining line into at most six whitespace-separated pieces and
# keep the first five columns of the resulting character matrix.
s <- s %>%
  str_split_fixed("\\s+", n = 6) %>%
  .[, 1:5]
s
##       [,1] [,2] [,3] [,4]  [,5]
##  [1,] "1"  "75" "75" "94"  "0" 
##  [2,] "2"  "77" "67" "69"  "0" 
##  [3,] "3"  "67" "78" "80"  "0" 
##  [4,] "4"  "71" "99" "84"  "0" 
##  [5,] "5"  "62" "89" "74"  "0" 
##  [6,] "6"  "77" "74" "83"  "0" 
##  [7,] "7"  "85" "67" "87"  "0" 
##  [8,] "8"  "84" "77" "94"  "0" 
##  [9,] "9"  "79" "90" "72"  "0" 
## [10,] "10" "66" "73" "98"  "0" 
## [11,] "11" "92" "78" "92"  "0" 
## [12,] "12" "79" "66" "80"  "0" 
## [13,] "13" "81" "88" "100" "0" 
## [14,] "14" "70" "81" "79"  "0" 
## [15,] "15" "87" "91" "84"  "0" 
## [16,] "16" "70" "71" "80"  "0" 
## [17,] "17" "70" "68" "88"  "0" 
## [18,] "18" "76" "79" "78"  "0" 
## [19,] "19" "81" "82" "75"  "0" 
## [20,] "20" "69" "79" "106" "0" 
## [21,] "21" "70" "67" "124" "0" 
## [22,] "22" "68" "97" "110" "0" 
## [23,] "23" "70" "71" "109" "0" 
## [24,] "24" "78" "79" "122" "0" 
## [25,] "25" "60" "75" "137" "0" 
## [26,] "26" "76" "82" "132" "0" 
## [27,] "27" "78" "82" "122" "0" 
## [28,] "28" "84" "81" "112" "0" 
## [29,] "29" "83" "70" "131" "0" 
## [30,] "30" "73" "91" "132" "0"
# s is already the cleaned 30 x 5 character matrix (one row per day), so no
# rows are removed here; dropping rows 1, 2, 7 and 10 again would discard
# real days and make the index ranges below run past the end (yielding NA).
# Columns follow the header line "SEP 2015 2016 2017 2018":
#   V1 = day, V2 = 2015, V3 = 2016, V4 = 2017, V5 = 2018.
# stringsAsFactors = FALSE keeps the values as character so as.numeric()
# parses the digits rather than factor codes on older R versions.
s_df <- as.data.frame(s, stringsAsFactors = FALSE)
mean(as.numeric(s_df$V2)) #mean per day 2015
## [1] 75.26667
# NOTE(review): the before/after split is taken on the 2017 column (V4);
# the 2018 column (V5) is all zeros in this data. Confirm 2017 is the
# intended year.
mean(as.numeric(s_df$V4[1:19])) #sep 1-19
## [1] 83.73684
mean(as.numeric(s_df$V4[20:30])) #sep 20-30
## [1] 121.5455
#Question 13
# tab <- tab %>% gather(year, deaths, -day) %>%
    #mutate(deaths = as.numeric(deaths))
# tab