library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (readr)
# URL of the Rdatasets `datasets.csv` index file on GitHub.
urlfile <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"

# Use the local copy when it exists in the working directory (keeps results
# tied to that snapshot); otherwise read straight from the URL so the script
# still runs on a machine that does not have "datasets.csv" on disk.
datasets_source <- if (file.exists("datasets.csv")) "datasets.csv" else urlfile
datasets <- read.csv(datasets_source)

# Per-column overview of the freshly loaded table.
summary(datasets)
## Package Item Title Rows
## Length:1965 Length:1965 Length:1965 Min. : 2
## Class :character Class :character Class :character 1st Qu.: 38
## Mode :character Mode :character Mode :character Median : 121
## Mean : 3890
## 3rd Qu.: 746
## Max. :1414593
## Cols n_binary n_character n_factor
## Min. : 1.00 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 5.00 Median : 0.000 Median : 0.0000 Median : 0.000
## Mean : 13.14 Mean : 2.205 Mean : 0.3221 Mean : 1.268
## 3rd Qu.: 9.00 3rd Qu.: 2.000 3rd Qu.: 0.0000 3rd Qu.: 2.000
## Max. :6831.00 Max. :624.000 Max. :29.0000 Max. :64.000
## n_logical n_numeric CSV Doc
## Min. : 0.00000 Min. : 0.00 Length:1965 Length:1965
## 1st Qu.: 0.00000 1st Qu.: 2.00 Class :character Class :character
## Median : 0.00000 Median : 4.00 Mode :character Mode :character
## Mean : 0.02697 Mean : 11.48
## 3rd Qu.: 0.00000 3rd Qu.: 7.00
## Max. :11.00000 Max. :6830.00
# Mean and median of the `n_logical` and `n_numeric` columns over all rows.
datasets %>%
  summarise(
    Avg_n_logical = mean(n_logical),
    Avg_n_numeric = mean(n_numeric),
    Median_n_logical = median(n_logical),
    Median_n_numeric = median(n_numeric)
  )
## Avg_n_logical Avg_n_numeric Median_n_logical Median_n_numeric
## 1 0.02697201 11.47735 0 4
# Keep only the identifier columns and the size / column-type counts.
new_datasets <- datasets %>%
  select(
    Package, Item, Rows, Cols,
    n_binary, n_character, n_factor, n_logical, n_numeric
  )

# Keep the rows whose `Rows` value is below 200. `which()` drops any NA
# comparison results, and plain indexing keeps the original row names.
new_datasets_2 <- new_datasets[which(new_datasets$Rows < 200), ]
head(new_datasets_2, n = 6L)
## Package Item Rows Cols n_binary n_character n_factor n_logical
## 2 AER ArgentinaCPI 80 2 0 0 0 0
## 4 AER BenderlyZwick 31 5 0 0 0 0
## 5 AER BondYield 60 2 0 0 0 0
## 8 AER ChinaIncome 37 5 0 0 0 0
## 9 AER CigarettesB 46 3 0 0 0 0
## 10 AER CigarettesSW 96 9 2 0 2 0
## n_numeric
## 2 2
## 4 5
## 5 2
## 8 5
## 9 3
## 10 7
# Rename all nine columns positionally; the order of the new names must match
# the current column order of `new_datasets_2`.
new_datasets_2 <- setNames(
  new_datasets_2,
  c(
    "Pack_Name", "item", "n_rows", "n_cols",
    "no_binary", "no_character", "no_factor", "no_logical", "no_numeric"
  )
)
head(new_datasets_2, n = 6L)
## Pack_Name item n_rows n_cols no_binary no_character no_factor
## 2 AER ArgentinaCPI 80 2 0 0 0
## 4 AER BenderlyZwick 31 5 0 0 0
## 5 AER BondYield 60 2 0 0 0
## 8 AER ChinaIncome 37 5 0 0 0
## 9 AER CigarettesB 46 3 0 0 0
## 10 AER CigarettesSW 96 9 2 0 2
## no_logical no_numeric
## 2 0 2
## 4 0 5
## 5 0 2
## 8 0 5
## 9 0 3
## 10 0 7
# Per-column overview of the filtered, renamed table.
new_datasets_2 %>% summary()
## Pack_Name item n_rows n_cols
## Length:1124 Length:1124 Min. : 2.00 Min. : 1.00
## Class :character Class :character 1st Qu.: 22.00 1st Qu.: 3.00
## Mode :character Mode :character Median : 45.00 Median : 4.00
## Mean : 59.45 Mean : 14.12
## 3rd Qu.: 88.00 3rd Qu.: 7.00
## Max. :199.00 Max. :6831.00
## no_binary no_character no_factor no_logical
## Min. : 0.000 Min. : 0.00 Min. : 0.0000 Min. :0.000000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.:0.000000
## Median : 0.000 Median : 0.00 Median : 0.0000 Median :0.000000
## Mean : 1.392 Mean : 0.25 Mean : 0.8025 Mean :0.008897
## 3rd Qu.: 1.000 3rd Qu.: 0.00 3rd Qu.: 1.0000 3rd Qu.:0.000000
## Max. :624.000 Max. :19.00 Max. :28.0000 Max. :2.000000
## no_numeric
## Min. : 0.00
## 1st Qu.: 2.00
## Median : 3.00
## Mean : 13.02
## 3rd Qu.: 6.00
## Max. :6830.00
# Mean and median of `no_logical` and `no_numeric` for the filtered subset.
new_datasets_2 %>%
  summarise(
    Avg_no_logical = mean(no_logical),
    Avg_no_numeric = mean(no_numeric),
    Median_no_logical = median(no_logical),
    Median_no_numeric = median(no_numeric)
  )
## Avg_no_logical Avg_no_numeric Median_no_logical Median_no_numeric
## 1 0.008896797 13.01957 0 3
# Recode the package label "AER" as "GER"; every other value is left as is.
new_datasets_3 <- new_datasets_2 %>%
  mutate(
    Pack_Name = if_else(Pack_Name == "AER", "GER", Pack_Name)
  )
head(new_datasets_3, n = 6L)
## Pack_Name item n_rows n_cols no_binary no_character no_factor
## 2 GER ArgentinaCPI 80 2 0 0 0
## 4 GER BenderlyZwick 31 5 0 0 0
## 5 GER BondYield 60 2 0 0 0
## 8 GER ChinaIncome 37 5 0 0 0
## 9 GER CigarettesB 46 3 0 0 0
## 10 GER CigarettesSW 96 9 2 0 2
## no_logical no_numeric
## 2 0 2
## 4 0 5
## 5 0 2
## 8 0 5
## 9 0 3
## 10 0 7
# Show the first 120 rows so values other than the recoded "GER" are visible.
head(new_datasets_3, n = 120L)
## Pack_Name item n_rows n_cols no_binary no_character
## 2 GER ArgentinaCPI 80 2 0 0
## 4 GER BenderlyZwick 31 5 0 0
## 5 GER BondYield 60 2 0 0
## 8 GER ChinaIncome 37 5 0 0
## 9 GER CigarettesB 46 3 0 0
## 10 GER CigarettesSW 96 9 2 0
## 12 GER ConsumerGood 108 3 0 0
## 27 GER Electricity1955 159 8 0 0
## 28 GER Electricity1970 158 8 0 0
## 30 GER Equipment 25 4 0 0
## 31 GER EuroEnergy 20 2 0 0
## 36 GER GermanUnemployment 120 2 0 0
## 38 GER GrowthDJ 121 10 3 0
## 39 GER GrowthSW 65 6 0 0
## 47 GER Journals 180 10 1 1
## 48 GER KleinI 22 9 0 0
## 49 GER Longley 16 4 0 0
## 50 GER ManufactCosts 25 9 0 0
## 55 GER Mortgage 78 16 5 0
## 56 GER MotorCycles 48 2 0 0
## 57 GER MotorCycles2 67 2 0 0
## 60 GER MurderRates 44 8 1 0
## 61 GER NaturalGas 138 10 0 0
## 65 GER OECDGrowth 22 6 0 0
## 66 GER OlympicTV 10 2 0 0
## 67 GER OrangeCounty 76 2 0 0
## 68 GER Parade2005 130 5 2 0
## 71 GER ProgramEffectiveness 32 4 2 0
## 77 GER ShipAccidents 40 5 1 0
## 78 GER SIC33 27 3 0 0
## 80 GER SportsCards 148 9 4 0
## 82 GER StrikeDuration 62 2 0 0
## 85 GER TechChange 41 3 0 0
## 86 GER TradeCredit 21 7 0 0
## 88 GER UKInflation 54 2 0 0
## 89 GER UKNonDurables 136 2 0 0
## 90 GER USAirlines 90 6 0 0
## 91 GER USConsump1950 11 3 1 0
## 92 GER USConsump1979 10 2 0 0
## 93 GER USConsump1993 44 2 0 0
## 94 GER USCrudes 99 3 0 0
## 95 GER USGasB 38 6 0 0
## 96 GER USGasG 36 10 0 0
## 97 GER USInvest 15 4 0 0
## 98 GER USMacroB 146 3 0 0
## 100 GER USMacroSW 193 7 0 0
## 103 GER USMoney 136 3 0 0
## 104 GER USProdIndex 128 2 0 0
## 108 aod antibio 24 3 0 0
## 109 aod cohorts 49 4 0 0
## 110 aod dja 75 6 1 0
## 111 aod lizards 24 6 3 0
## 112 aod mice 20 3 1 0
## 113 aod orob1 16 3 0 0
## 114 aod orob2 21 4 2 0
## 115 aod rabbits 84 3 0 0
## 116 aod rats 32 3 1 0
## 117 aod salmonella 18 2 0 0
## 120 asaur gastricXelox 48 2 1 0
## 122 asaur pancreatic 41 4 1 0
## 123 asaur pancreatic2 41 4 1 0
## 124 asaur pharmacoSmoking 125 14 5 0
## 126 betareg CarTask 155 3 1 0
## 127 betareg FoodExpenditure 38 3 0 0
## 128 betareg GasolineYield 32 6 0 0
## 130 betareg MockJurors 104 3 2 0
## 131 betareg ReadingSkills 44 3 1 0
## 132 betareg StressAnxiety 166 2 0 0
## 134 boot acme 60 3 0 1
## 136 boot aircondit 12 1 0 0
## 137 boot aircondit7 24 1 0 0
## 139 boot aml 23 3 2 0
## 140 boot beaver 100 4 2 0
## 141 boot bigcity 49 2 0 0
## 143 boot breslow 10 5 1 0
## 144 boot calcium 27 2 0 0
## 145 boot cane 180 5 0 0
## 146 boot capability 75 1 0 0
## 147 boot catsM 97 3 0 0
## 148 boot cav 138 2 0 0
## 149 boot cd4 20 2 0 0
## 151 boot city 10 2 0 0
## 152 boot claridge 37 2 0 0
## 153 boot cloth 32 2 0 0
## 154 boot co.transfer 7 2 0 0
## 155 boot coal 191 1 0 0
## 156 boot darwin 15 1 0 0
## 157 boot dogs 7 2 0 0
## 158 boot downs.bc 30 3 0 0
## 159 boot ducks 11 2 0 0
## 160 boot fir 50 3 0 0
## 161 boot frets 25 4 0 0
## 162 boot grav 26 2 1 0
## 163 boot gravity 81 2 0 0
## 164 boot hirose 44 3 1 0
## 165 boot islay 18 1 0 0
## 168 boot motor 94 4 0 0
## 170 boot nitrofen 50 5 0 0
## 171 boot nodal 53 7 6 0
## 172 boot nuclear 32 11 5 0
## 174 boot poisons 48 3 0 0
## 175 boot polar 50 2 0 0
## 176 boot remission 27 3 1 0
## 177 boot salinity 28 4 0 0
## 178 boot survival 14 2 0 0
## 179 boot tau 60 2 0 0
## 180 boot tuna 64 1 0 0
## 181 boot urine 79 7 1 0
## 183 carData Adler 108 3 1 0
## 184 carData AMSsurvey 24 5 2 0
## 185 carData Angell 43 4 0 0
## 186 carData Anscombe 51 4 0 0
## 188 carData Baumann 66 6 0 0
## 190 carData Bfox 30 6 0 0
## 192 carData Burt 27 3 0 0
## 193 carData CanPop 16 2 0 0
## 196 carData Chirot 32 5 0 0
## 199 carData DavisThin 191 7 0 0
## 201 carData Duncan 45 4 0 0
## 202 carData Ericksen 66 9 1 0
## no_factor no_logical no_numeric
## 2 0 0 2
## 4 0 0 5
## 5 0 0 2
## 8 0 0 5
## 9 0 0 3
## 10 2 0 7
## 12 0 0 3
## 27 0 0 8
## 28 0 0 8
## 30 0 0 4
## 31 0 0 2
## 36 0 0 2
## 38 3 0 7
## 39 0 0 6
## 47 3 0 6
## 48 0 0 9
## 49 0 0 4
## 50 0 0 9
## 55 5 0 11
## 56 0 0 2
## 57 0 0 2
## 60 1 0 7
## 61 3 0 7
## 65 0 0 6
## 66 1 0 1
## 67 0 0 2
## 68 3 0 2
## 71 2 0 2
## 77 3 0 2
## 78 0 0 3
## 80 6 0 3
## 82 0 0 2
## 85 0 0 3
## 86 0 0 7
## 88 0 0 2
## 89 0 0 2
## 90 2 0 4
## 91 0 0 3
## 92 0 0 2
## 93 0 0 2
## 94 0 0 3
## 95 0 0 6
## 96 0 0 10
## 97 0 0 4
## 98 0 0 3
## 100 0 0 7
## 103 0 0 3
## 104 0 0 2
## 108 0 0 3
## 109 2 0 2
## 110 3 0 3
## 111 4 0 2
## 112 1 0 2
## 113 1 0 2
## 114 2 0 2
## 115 1 0 2
## 116 1 0 2
## 117 0 0 2
## 120 0 0 2
## 122 4 0 0
## 123 1 0 3
## 124 7 0 7
## 126 1 0 2
## 127 0 0 3
## 128 1 0 5
## 130 2 0 1
## 131 1 0 2
## 132 0 0 2
## 134 0 0 2
## 136 0 0 1
## 137 0 0 1
## 139 0 0 3
## 140 0 0 4
## 141 0 0 2
## 143 1 0 4
## 144 0 0 2
## 145 2 0 3
## 146 0 0 1
## 147 1 0 2
## 148 0 0 2
## 149 0 0 2
## 151 0 0 2
## 152 0 0 2
## 153 0 0 2
## 154 0 0 2
## 155 0 0 1
## 156 0 0 1
## 157 0 0 2
## 158 0 0 3
## 159 0 0 2
## 160 0 0 3
## 161 0 0 4
## 162 1 0 1
## 163 1 0 1
## 164 0 0 3
## 165 0 0 1
## 168 0 0 4
## 170 0 0 5
## 171 0 0 7
## 172 0 0 11
## 174 2 0 1
## 175 0 0 2
## 176 0 0 3
## 177 0 0 4
## 178 0 0 2
## 179 1 0 1
## 180 0 0 1
## 181 0 0 7
## 183 2 0 1
## 184 3 0 2
## 185 1 0 3
## 186 0 0 4
## 188 1 0 5
## 190 0 0 6
## 192 1 0 2
## 193 0 0 2
## 196 0 0 5
## 199 0 0 7
## 201 1 0 3
## 202 1 0 8
## 7. BONUS – place the original .csv in a GitHub file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.
library (readr)
# Raw GitHub URL of the exported `new_datasets_3.csv` file.
urlfile <- "https://raw.githubusercontent.com/tonyCUNY/test/main/new_datasets_3.csv"

# Open a connection to the URL and parse it with readr.
data <- urlfile %>%
  url() %>%
  read_csv()
## Rows: 1124 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Pack_Name, item
## dbl (7): n_rows, n_cols, no_binary, no_character, no_factor, no_logical, no_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the table read from GitHub.
head(data, n = 6L)
## # A tibble: 6 × 9
## Pack_Name item n_rows n_cols no_bin…¹ no_ch…² no_fa…³ no_lo…⁴ no_nu…⁵
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 GER ArgentinaCPI 80 2 0 0 0 0 2
## 2 GER BenderlyZwick 31 5 0 0 0 0 5
## 3 GER BondYield 60 2 0 0 0 0 2
## 4 GER ChinaIncome 37 5 0 0 0 0 5
## 5 GER CigarettesB 46 3 0 0 0 0 3
## 6 GER CigarettesSW 96 9 2 0 2 0 7
## # … with abbreviated variable names ¹no_binary, ²no_character, ³no_factor,
## # ⁴no_logical, ⁵no_numeric