1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (readr)
# Location of the Rdatasets index CSV on GitHub.
urlfile <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
# Read straight from the URL so the script does not depend on a local copy of
# datasets.csv being present in the working directory.
# NOTE(review): the upstream file may have changed since the local copy was
# saved, so the figures printed below may differ -- confirm.
datasets <- read.csv(urlfile)
summary(datasets)
##    Package              Item              Title                Rows        
##  Length:1965        Length:1965        Length:1965        Min.   :      2  
##  Class :character   Class :character   Class :character   1st Qu.:     38  
##  Mode  :character   Mode  :character   Mode  :character   Median :    121  
##                                                           Mean   :   3890  
##                                                           3rd Qu.:    746  
##                                                           Max.   :1414593  
##       Cols            n_binary        n_character         n_factor     
##  Min.   :   1.00   Min.   :  0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:   3.00   1st Qu.:  0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median :   5.00   Median :  0.000   Median : 0.0000   Median : 0.000  
##  Mean   :  13.14   Mean   :  2.205   Mean   : 0.3221   Mean   : 1.268  
##  3rd Qu.:   9.00   3rd Qu.:  2.000   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  Max.   :6831.00   Max.   :624.000   Max.   :29.0000   Max.   :64.000  
##    n_logical          n_numeric           CSV                Doc           
##  Min.   : 0.00000   Min.   :   0.00   Length:1965        Length:1965       
##  1st Qu.: 0.00000   1st Qu.:   2.00   Class :character   Class :character  
##  Median : 0.00000   Median :   4.00   Mode  :character   Mode  :character  
##  Mean   : 0.02697   Mean   :  11.48                                        
##  3rd Qu.: 0.00000   3rd Qu.:   7.00                                        
##  Max.   :11.00000   Max.   :6830.00
# One-row table: mean and median of the n_logical and n_numeric columns.
summarize(
  datasets,
  Avg_n_logical = mean(n_logical),
  Avg_n_numeric = mean(n_numeric),
  Median_n_logical = median(n_logical),
  Median_n_numeric = median(n_numeric)
)
##   Avg_n_logical Avg_n_numeric Median_n_logical Median_n_numeric
## 1    0.02697201      11.47735                0                4

2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# Keep nine of the original columns (Title, CSV and Doc are dropped).
new_datasets <- datasets %>%
  select(all_of(c(
    "Package", "Item", "Rows", "Cols", "n_binary",
    "n_character", "n_factor", "n_logical", "n_numeric"
  )))

# Keep only the rows with Rows < 200. which() discards NA comparisons the same
# way subset() does, and row indexing retains the original row names.
new_datasets_2 <- new_datasets[which(new_datasets$Rows < 200), ]
head(new_datasets_2)
##    Package          Item Rows Cols n_binary n_character n_factor n_logical
## 2      AER  ArgentinaCPI   80    2        0           0        0         0
## 4      AER BenderlyZwick   31    5        0           0        0         0
## 5      AER     BondYield   60    2        0           0        0         0
## 8      AER   ChinaIncome   37    5        0           0        0         0
## 9      AER   CigarettesB   46    3        0           0        0         0
## 10     AER  CigarettesSW   96    9        2           0        2         0
##    n_numeric
## 2          2
## 4          5
## 5          2
## 8          5
## 9          3
## 10         7

3. Create new column names for the new data frame

# Replace all nine column names, matched by position.
names(new_datasets_2) <- c(
  "Pack_Name", "item", "n_rows", "n_cols", "no_binary",
  "no_character", "no_factor", "no_logical", "no_numeric"
)
head(new_datasets_2)
##    Pack_Name          item n_rows n_cols no_binary no_character no_factor
## 2        AER  ArgentinaCPI     80      2         0            0         0
## 4        AER BenderlyZwick     31      5         0            0         0
## 5        AER     BondYield     60      2         0            0         0
## 8        AER   ChinaIncome     37      5         0            0         0
## 9        AER   CigarettesB     46      3         0            0         0
## 10       AER  CigarettesSW     96      9         2            0         2
##    no_logical no_numeric
## 2           0          2
## 4           0          5
## 5           0          2
## 8           0          5
## 9           0          3
## 10          0          7

4. Use the summary function to create an overview of your new data frame. Then print the mean

and median for the same two attributes. Please compare.

# Per-column overview of the subsetted, renamed data frame.
new_datasets_2 %>% summary()
##   Pack_Name             item               n_rows           n_cols       
##  Length:1124        Length:1124        Min.   :  2.00   Min.   :   1.00  
##  Class :character   Class :character   1st Qu.: 22.00   1st Qu.:   3.00  
##  Mode  :character   Mode  :character   Median : 45.00   Median :   4.00  
##                                        Mean   : 59.45   Mean   :  14.12  
##                                        3rd Qu.: 88.00   3rd Qu.:   7.00  
##                                        Max.   :199.00   Max.   :6831.00  
##    no_binary        no_character     no_factor         no_logical      
##  Min.   :  0.000   Min.   : 0.00   Min.   : 0.0000   Min.   :0.000000  
##  1st Qu.:  0.000   1st Qu.: 0.00   1st Qu.: 0.0000   1st Qu.:0.000000  
##  Median :  0.000   Median : 0.00   Median : 0.0000   Median :0.000000  
##  Mean   :  1.392   Mean   : 0.25   Mean   : 0.8025   Mean   :0.008897  
##  3rd Qu.:  1.000   3rd Qu.: 0.00   3rd Qu.: 1.0000   3rd Qu.:0.000000  
##  Max.   :624.000   Max.   :19.00   Max.   :28.0000   Max.   :2.000000  
##    no_numeric     
##  Min.   :   0.00  
##  1st Qu.:   2.00  
##  Median :   3.00  
##  Mean   :  13.02  
##  3rd Qu.:   6.00  
##  Max.   :6830.00
# One-row table: mean and median of the no_logical and no_numeric columns,
# for comparison with the same statistics on the full data set.
summarize(
  new_datasets_2,
  Avg_no_logical = mean(no_logical),
  Avg_no_numeric = mean(no_numeric),
  Median_no_logical = median(no_logical),
  Median_no_numeric = median(no_numeric)
)
##   Avg_no_logical Avg_no_numeric Median_no_logical Median_no_numeric
## 1    0.008896797       13.01957                 0                 3

Avg_no_logical decreased (0.0270 → 0.0089).

Avg_no_numeric increased (11.48 → 13.02).

Median_no_logical did not change (0 in both).

Median_no_numeric decreased (4 → 3).

5. For at least 3 values in a column please rename so that every value in that column is renamed.

For example, suppose I have 20 values of the letter “e” in one column. Rename those values so

that all 20 would show as “excellent”.

# Relabel every "AER" entry in Pack_Name as "GER"; other values are unchanged.
new_datasets_3 <- new_datasets_2 %>%
  mutate(Pack_Name = if_else(Pack_Name == "AER", "GER", Pack_Name))
head(new_datasets_3)
##    Pack_Name          item n_rows n_cols no_binary no_character no_factor
## 2        GER  ArgentinaCPI     80      2         0            0         0
## 4        GER BenderlyZwick     31      5         0            0         0
## 5        GER     BondYield     60      2         0            0         0
## 8        GER   ChinaIncome     37      5         0            0         0
## 9        GER   CigarettesB     46      3         0            0         0
## 10       GER  CigarettesSW     96      9         2            0         2
##    no_logical no_numeric
## 2           0          2
## 4           0          5
## 5           0          2
## 8           0          5
## 9           0          3
## 10          0          7

6. Display enough rows to see examples of all of steps 1-5 above

# Show the first 120 rows so that both renamed and untouched Pack_Name values
# are visible.
new_datasets_3 %>% head(n = 120)
##     Pack_Name                 item n_rows n_cols no_binary no_character
## 2         GER         ArgentinaCPI     80      2         0            0
## 4         GER        BenderlyZwick     31      5         0            0
## 5         GER            BondYield     60      2         0            0
## 8         GER          ChinaIncome     37      5         0            0
## 9         GER          CigarettesB     46      3         0            0
## 10        GER         CigarettesSW     96      9         2            0
## 12        GER         ConsumerGood    108      3         0            0
## 27        GER      Electricity1955    159      8         0            0
## 28        GER      Electricity1970    158      8         0            0
## 30        GER            Equipment     25      4         0            0
## 31        GER           EuroEnergy     20      2         0            0
## 36        GER   GermanUnemployment    120      2         0            0
## 38        GER             GrowthDJ    121     10         3            0
## 39        GER             GrowthSW     65      6         0            0
## 47        GER             Journals    180     10         1            1
## 48        GER               KleinI     22      9         0            0
## 49        GER              Longley     16      4         0            0
## 50        GER        ManufactCosts     25      9         0            0
## 55        GER             Mortgage     78     16         5            0
## 56        GER          MotorCycles     48      2         0            0
## 57        GER         MotorCycles2     67      2         0            0
## 60        GER          MurderRates     44      8         1            0
## 61        GER           NaturalGas    138     10         0            0
## 65        GER           OECDGrowth     22      6         0            0
## 66        GER            OlympicTV     10      2         0            0
## 67        GER         OrangeCounty     76      2         0            0
## 68        GER           Parade2005    130      5         2            0
## 71        GER ProgramEffectiveness     32      4         2            0
## 77        GER        ShipAccidents     40      5         1            0
## 78        GER                SIC33     27      3         0            0
## 80        GER          SportsCards    148      9         4            0
## 82        GER       StrikeDuration     62      2         0            0
## 85        GER           TechChange     41      3         0            0
## 86        GER          TradeCredit     21      7         0            0
## 88        GER          UKInflation     54      2         0            0
## 89        GER        UKNonDurables    136      2         0            0
## 90        GER           USAirlines     90      6         0            0
## 91        GER        USConsump1950     11      3         1            0
## 92        GER        USConsump1979     10      2         0            0
## 93        GER        USConsump1993     44      2         0            0
## 94        GER             USCrudes     99      3         0            0
## 95        GER               USGasB     38      6         0            0
## 96        GER               USGasG     36     10         0            0
## 97        GER             USInvest     15      4         0            0
## 98        GER             USMacroB    146      3         0            0
## 100       GER            USMacroSW    193      7         0            0
## 103       GER              USMoney    136      3         0            0
## 104       GER          USProdIndex    128      2         0            0
## 108       aod              antibio     24      3         0            0
## 109       aod              cohorts     49      4         0            0
## 110       aod                  dja     75      6         1            0
## 111       aod              lizards     24      6         3            0
## 112       aod                 mice     20      3         1            0
## 113       aod                orob1     16      3         0            0
## 114       aod                orob2     21      4         2            0
## 115       aod              rabbits     84      3         0            0
## 116       aod                 rats     32      3         1            0
## 117       aod           salmonella     18      2         0            0
## 120     asaur         gastricXelox     48      2         1            0
## 122     asaur           pancreatic     41      4         1            0
## 123     asaur          pancreatic2     41      4         1            0
## 124     asaur      pharmacoSmoking    125     14         5            0
## 126   betareg              CarTask    155      3         1            0
## 127   betareg      FoodExpenditure     38      3         0            0
## 128   betareg        GasolineYield     32      6         0            0
## 130   betareg           MockJurors    104      3         2            0
## 131   betareg        ReadingSkills     44      3         1            0
## 132   betareg        StressAnxiety    166      2         0            0
## 134      boot                 acme     60      3         0            1
## 136      boot            aircondit     12      1         0            0
## 137      boot           aircondit7     24      1         0            0
## 139      boot                  aml     23      3         2            0
## 140      boot               beaver    100      4         2            0
## 141      boot              bigcity     49      2         0            0
## 143      boot              breslow     10      5         1            0
## 144      boot              calcium     27      2         0            0
## 145      boot                 cane    180      5         0            0
## 146      boot           capability     75      1         0            0
## 147      boot                catsM     97      3         0            0
## 148      boot                  cav    138      2         0            0
## 149      boot                  cd4     20      2         0            0
## 151      boot                 city     10      2         0            0
## 152      boot             claridge     37      2         0            0
## 153      boot                cloth     32      2         0            0
## 154      boot          co.transfer      7      2         0            0
## 155      boot                 coal    191      1         0            0
## 156      boot               darwin     15      1         0            0
## 157      boot                 dogs      7      2         0            0
## 158      boot             downs.bc     30      3         0            0
## 159      boot                ducks     11      2         0            0
## 160      boot                  fir     50      3         0            0
## 161      boot                frets     25      4         0            0
## 162      boot                 grav     26      2         1            0
## 163      boot              gravity     81      2         0            0
## 164      boot               hirose     44      3         1            0
## 165      boot                islay     18      1         0            0
## 168      boot                motor     94      4         0            0
## 170      boot             nitrofen     50      5         0            0
## 171      boot                nodal     53      7         6            0
## 172      boot              nuclear     32     11         5            0
## 174      boot              poisons     48      3         0            0
## 175      boot                polar     50      2         0            0
## 176      boot            remission     27      3         1            0
## 177      boot             salinity     28      4         0            0
## 178      boot             survival     14      2         0            0
## 179      boot                  tau     60      2         0            0
## 180      boot                 tuna     64      1         0            0
## 181      boot                urine     79      7         1            0
## 183   carData                Adler    108      3         1            0
## 184   carData            AMSsurvey     24      5         2            0
## 185   carData               Angell     43      4         0            0
## 186   carData             Anscombe     51      4         0            0
## 188   carData              Baumann     66      6         0            0
## 190   carData                 Bfox     30      6         0            0
## 192   carData                 Burt     27      3         0            0
## 193   carData               CanPop     16      2         0            0
## 196   carData               Chirot     32      5         0            0
## 199   carData            DavisThin    191      7         0            0
## 201   carData               Duncan     45      4         0            0
## 202   carData             Ericksen     66      9         1            0
##     no_factor no_logical no_numeric
## 2           0          0          2
## 4           0          0          5
## 5           0          0          2
## 8           0          0          5
## 9           0          0          3
## 10          2          0          7
## 12          0          0          3
## 27          0          0          8
## 28          0          0          8
## 30          0          0          4
## 31          0          0          2
## 36          0          0          2
## 38          3          0          7
## 39          0          0          6
## 47          3          0          6
## 48          0          0          9
## 49          0          0          4
## 50          0          0          9
## 55          5          0         11
## 56          0          0          2
## 57          0          0          2
## 60          1          0          7
## 61          3          0          7
## 65          0          0          6
## 66          1          0          1
## 67          0          0          2
## 68          3          0          2
## 71          2          0          2
## 77          3          0          2
## 78          0          0          3
## 80          6          0          3
## 82          0          0          2
## 85          0          0          3
## 86          0          0          7
## 88          0          0          2
## 89          0          0          2
## 90          2          0          4
## 91          0          0          3
## 92          0          0          2
## 93          0          0          2
## 94          0          0          3
## 95          0          0          6
## 96          0          0         10
## 97          0          0          4
## 98          0          0          3
## 100         0          0          7
## 103         0          0          3
## 104         0          0          2
## 108         0          0          3
## 109         2          0          2
## 110         3          0          3
## 111         4          0          2
## 112         1          0          2
## 113         1          0          2
## 114         2          0          2
## 115         1          0          2
## 116         1          0          2
## 117         0          0          2
## 120         0          0          2
## 122         4          0          0
## 123         1          0          3
## 124         7          0          7
## 126         1          0          2
## 127         0          0          3
## 128         1          0          5
## 130         2          0          1
## 131         1          0          2
## 132         0          0          2
## 134         0          0          2
## 136         0          0          1
## 137         0          0          1
## 139         0          0          3
## 140         0          0          4
## 141         0          0          2
## 143         1          0          4
## 144         0          0          2
## 145         2          0          3
## 146         0          0          1
## 147         1          0          2
## 148         0          0          2
## 149         0          0          2
## 151         0          0          2
## 152         0          0          2
## 153         0          0          2
## 154         0          0          2
## 155         0          0          1
## 156         0          0          1
## 157         0          0          2
## 158         0          0          3
## 159         0          0          2
## 160         0          0          3
## 161         0          0          4
## 162         1          0          1
## 163         1          0          1
## 164         0          0          3
## 165         0          0          1
## 168         0          0          4
## 170         0          0          5
## 171         0          0          7
## 172         0          0         11
## 174         2          0          1
## 175         0          0          2
## 176         0          0          3
## 177         0          0          4
## 178         0          0          2
## 179         1          0          1
## 180         0          0          1
## 181         0          0          7
## 183         2          0          1
## 184         3          0          2
## 185         1          0          3
## 186         0          0          4
## 188         1          0          5
## 190         0          0          6
## 192         1          0          2
## 193         0          0          2
## 196         0          0          5
## 199         0          0          7
## 201         1          0          3
## 202         1          0          8

7. BONUS – place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

library (readr)
# Raw GitHub URL of the hosted CSV file.
urlfile <- "https://raw.githubusercontent.com/tonyCUNY/test/main/new_datasets_3.csv"
# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed.
data <- read_csv(urlfile)
## Rows: 1124 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Pack_Name, item
## dbl (7): n_rows, n_cols, no_binary, no_character, no_factor, no_logical, no_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the tibble read from GitHub.
data %>% head()
## # A tibble: 6 × 9
##   Pack_Name item          n_rows n_cols no_bin…¹ no_ch…² no_fa…³ no_lo…⁴ no_nu…⁵
##   <chr>     <chr>          <dbl>  <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 GER       ArgentinaCPI      80      2        0       0       0       0       2
## 2 GER       BenderlyZwick     31      5        0       0       0       0       5
## 3 GER       BondYield         60      2        0       0       0       0       2
## 4 GER       ChinaIncome       37      5        0       0       0       0       5
## 5 GER       CigarettesB       46      3        0       0       0       0       3
## 6 GER       CigarettesSW      96      9        2       0       2       0       7
## # … with abbreviated variable names ¹​no_binary, ²​no_character, ³​no_factor,
## #   ⁴​no_logical, ⁵​no_numeric