Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list)

1 Prerequisites: Read from CSV file

# Read datasets.csv from the current working directory into a data frame.
myWorkingDir <- getwd()
# file.path() joins path components with "/" instead of a hand-built paste().
mySourceFile <- file.path(myWorkingDir, "datasets.csv")
# header = TRUE and sep = "," are already the read.csv() defaults, so they are
# omitted. stringsAsFactors = TRUE is spelled out because the factor-based
# output recorded below (e.g. "1200 Levels") relies on text columns being
# factors, and the default became FALSE in R 4.0.0.
myLocalData <- read.csv(file = mySourceFile, stringsAsFactors = TRUE)
# Preview the first six rows.
head(myLocalData)
##   Package       Item                                           Title Rows
## 1    boot       acme                          Monthly Excess Returns   60
## 2    boot       aids    Delay in AIDS Reporting in England and Wales  570
## 3    boot  aircondit          Failures of Air-conditioning Equipment   12
## 4    boot aircondit7          Failures of Air-conditioning Equipment   24
## 5    boot       amis                  Car Speeding and Warning Signs 8437
## 6    boot        aml Remission Times for Acute Myelogenous Leukaemia   23
##   Cols has_logical has_binary has_numeric has_character
## 1    3       FALSE      FALSE        TRUE          TRUE
## 2    6       FALSE       TRUE        TRUE         FALSE
## 3    1       FALSE      FALSE        TRUE         FALSE
## 4    1       FALSE      FALSE        TRUE         FALSE
## 5    4       FALSE       TRUE        TRUE         FALSE
## 6    3       FALSE       TRUE        TRUE         FALSE
##                                                                                  CSV
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv
##                                                                                   Doc
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html
# Preview the first entries of the Item column, extracted by exact name.
head(myLocalData[["Item"]])
## [1] acme       aids       aircondit  aircondit7 amis       aml       
## 1200 Levels: a10 abbey ability ability.cov absentee accdeaths ... Zelig.url
# Install readr only when it is missing, so re-running the script does not
# download the package again; fetch it over HTTPS rather than plain HTTP.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr", repos = "https://cloud.r-project.org")
}
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Debabrata\AppData\Local\Temp\Rtmp8WN1qp\downloaded_packages

2 (1) Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

# Per-column overview of the full data set.
summary(object = myLocalData)
##       Package          Item     
##  Ecdat    :130   lung    :   3  
##  DAAG     :121   aids    :   2  
##  Stat2Data:119   channing:   2  
##  MASS     : 87   Cigar   :   2  
##  datasets : 84   cities  :   2  
##  carData  : 59   Clothing:   2  
##  (Other)  :643   (Other) :1230  
##                                                               Title     
##  Labour Training Evaluation Data                                 :  11  
##  Seven data sets showing a bifactor solution.                    :   9  
##  Individual Preferences Over Immigration Policy                  :   6  
##  John Snow's Map and Data on the 1854 London Cholera Outbreak    :   5  
##  Rain, wavesurge, portpirie and nidd datasets.                   :   4  
##  Australian and Related Historical Annual Climate Data, by region:   3  
##  (Other)                                                         :1205  
##       Rows             Cols         has_logical     has_binary     
##  Min.   :     0   Min.   :   1.00   Mode :logical   Mode :logical  
##  1st Qu.:    30   1st Qu.:   3.00   FALSE:1233      FALSE:717      
##  Median :    90   Median :   5.00   TRUE :10        TRUE :526      
##  Mean   :  1576   Mean   :  15.46                                  
##  3rd Qu.:   451   3rd Qu.:   9.00                                  
##  Max.   :372864   Max.   :6831.00                                  
##                                                                    
##  has_numeric     has_character  
##  Mode :logical   Mode :logical  
##  FALSE:329       FALSE:1190     
##  TRUE :914       TRUE :53       
##                                 
##                                 
##                                 
##                                 
##                                                                                  CSV      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv       :   1  
##  (Other)                                                                           :1237  
##                                                                                   Doc      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html       :   1  
##  (Other)                                                                            :1237
# Mean of Rows, with the column extracted four ways (backtick-quoted $,
# plain $, [row, column] indexing and [[ ]]); every call prints the same value.
mean(myLocalData$`Rows`)
## [1] 1575.697
# na.rm = TRUE drops missing values before the mean is calculated.
mean(myLocalData$Rows, na.rm = TRUE)
## [1] 1575.697
mean(myLocalData[, "Rows"], na.rm = TRUE)
## [1] 1575.697
mean(myLocalData[["Rows"]])
## [1] 1575.697
# Mean of Cols.
mean(myLocalData$`Cols`)
## [1] 15.465
# Median of Cols; na.rm = TRUE drops missing values before the median is
# calculated.
median(myLocalData$Cols, na.rm = TRUE)
## [1] 5
median(myLocalData[, "Cols"], na.rm = TRUE)
## [1] 5
median(myLocalData[["Cols"]])
## [1] 5

3 (2) Create a new data frame with a subset of the columns and rows. Make sure to rename it

# Keep rows 2 through 10 and four of the columns, relabelling the columns in
# the same step, then print the result.
myDataFrame <- setNames(
  myLocalData[2:10, c("Item", "Title", "Rows", "Cols")],
  c("Product", "Type", "Records#", "Attributes#")
)
myDataFrame
##       Product                                            Type Records#
## 2        aids    Delay in AIDS Reporting in England and Wales      570
## 3   aircondit          Failures of Air-conditioning Equipment       12
## 4  aircondit7          Failures of Air-conditioning Equipment       24
## 5        amis                  Car Speeding and Warning Signs     8437
## 6         aml Remission Times for Acute Myelogenous Leukaemia       23
## 7      beaver                    Beaver Body Temperature Data      100
## 8     bigcity                       Population of U.S. Cities       49
## 9    brambles               Spatial Location of Bramble Canes      823
## 10    breslow                    Smoking Deaths Among Doctors       10
##    Attributes#
## 2            6
## 3            1
## 4            1
## 5            4
## 6            3
## 7            4
## 8            2
## 9            3
## 10           5

4 (3) Create new column names for the new data frame.

# Give every column of the subset a new label, then print the data frame.
names(myDataFrame) <- c(
  "ProductName", "TypeName", "RecordsNumber", "AttributesNumber"
)
myDataFrame
##    ProductName                                        TypeName
## 2         aids    Delay in AIDS Reporting in England and Wales
## 3    aircondit          Failures of Air-conditioning Equipment
## 4   aircondit7          Failures of Air-conditioning Equipment
## 5         amis                  Car Speeding and Warning Signs
## 6          aml Remission Times for Acute Myelogenous Leukaemia
## 7       beaver                    Beaver Body Temperature Data
## 8      bigcity                       Population of U.S. Cities
## 9     brambles               Spatial Location of Bramble Canes
## 10     breslow                    Smoking Deaths Among Doctors
##    RecordsNumber AttributesNumber
## 2            570                6
## 3             12                1
## 4             24                1
## 5           8437                4
## 6             23                3
## 7            100                4
## 8             49                2
## 9            823                3
## 10            10                5

5 (4) Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Per-column overview of the nine-row subset.
summary(object = myDataFrame)
##      ProductName                                            TypeName
##  aids      :1    Failures of Air-conditioning Equipment         :2  
##  aircondit :1    Beaver Body Temperature Data                   :1  
##  aircondit7:1    Car Speeding and Warning Signs                 :1  
##  amis      :1    Delay in AIDS Reporting in England and Wales   :1  
##  aml       :1    Population of U.S. Cities                      :1  
##  beaver    :1    Remission Times for Acute Myelogenous Leukaemia:1  
##  (Other)   :3    (Other)                                        :2  
##  RecordsNumber  AttributesNumber
##  Min.   :  10   Min.   :1.000   
##  1st Qu.:  23   1st Qu.:2.000   
##  Median :  49   Median :3.000   
##  Mean   :1116   Mean   :3.222   
##  3rd Qu.: 570   3rd Qu.:4.000   
##  Max.   :8437   Max.   :6.000   
## 
# Mean of RecordsNumber, with the column extracted four ways (backtick-quoted
# $, plain $, [row, column] indexing and [[ ]]); every call prints the same
# value.
mean(myDataFrame$`RecordsNumber`)
## [1] 1116.444
# na.rm = TRUE drops missing values before the mean is calculated.
mean(myDataFrame$RecordsNumber, na.rm = TRUE)
## [1] 1116.444
mean(myDataFrame[, "RecordsNumber"], na.rm = TRUE)
## [1] 1116.444
mean(myDataFrame[["RecordsNumber"]])
## [1] 1116.444
# Median of AttributesNumber, extracted the same four ways.
median(myDataFrame$`AttributesNumber`)
## [1] 3
# na.rm = TRUE drops missing values before the median is calculated.
median(myDataFrame$AttributesNumber, na.rm = TRUE)
## [1] 3
median(myDataFrame[, "AttributesNumber"], na.rm = TRUE)
## [1] 3
median(myDataFrame[["AttributesNumber"]])
## [1] 3

6 (5) For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

# Swap the word "Failures" for "Errors" wherever it occurs in TypeName, then
# print the updated data frame.
myDataFrame[["TypeName"]] <- gsub(
  pattern = "Failures",
  replacement = "Errors",
  x = myDataFrame[["TypeName"]]
)
myDataFrame
##    ProductName                                        TypeName
## 2         aids    Delay in AIDS Reporting in England and Wales
## 3    aircondit            Errors of Air-conditioning Equipment
## 4   aircondit7            Errors of Air-conditioning Equipment
## 5         amis                  Car Speeding and Warning Signs
## 6          aml Remission Times for Acute Myelogenous Leukaemia
## 7       beaver                    Beaver Body Temperature Data
## 8      bigcity                       Population of U.S. Cities
## 9     brambles               Spatial Location of Bramble Canes
## 10     breslow                    Smoking Deaths Among Doctors
##    RecordsNumber AttributesNumber
## 2            570                6
## 3             12                1
## 4             24                1
## 5           8437                4
## 6             23                3
## 7            100                4
## 8             49                2
## 9            823                3
## 10            10                5

7 (6) Display enough rows to see examples of all of steps 1-5 above

# Print the whole subset (rows 2-10 of the original data) so the column
# selection, the new column names and the "Errors" replacement are all visible.
myDataFrame
##    ProductName                                        TypeName
## 2         aids    Delay in AIDS Reporting in England and Wales
## 3    aircondit            Errors of Air-conditioning Equipment
## 4   aircondit7            Errors of Air-conditioning Equipment
## 5         amis                  Car Speeding and Warning Signs
## 6          aml Remission Times for Acute Myelogenous Leukaemia
## 7       beaver                    Beaver Body Temperature Data
## 8      bigcity                       Population of U.S. Cities
## 9     brambles               Spatial Location of Bramble Canes
## 10     breslow                    Smoking Deaths Among Doctors
##    RecordsNumber AttributesNumber
## 2            570                6
## 3             12                1
## 4             24                1
## 5           8437                4
## 6             23                3
## 7            100                4
## 8             49                2
## 9            823                3
## 10            10                5

8 (7) BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

# Read CSV into R from GitHub
# Install RCurl only when it is missing, so re-running the script does not
# download the package again; fetch it over HTTPS rather than plain HTTP.
if (!requireNamespace("RCurl", quietly = TRUE)) {
  install.packages("RCurl", repos = "https://cloud.r-project.org")
}
## package 'RCurl' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Debabrata\AppData\Local\Temp\Rtmp8WN1qp\downloaded_packages
library(RCurl)
## Loading required package: bitops
# Raw-content URL of the copy of datasets.csv stored on GitHub.
myGitHubURL <- "https://raw.githubusercontent.com/destination4debabrata/CUNY-Assignments/master/Week%202%20%5BJan%202%20-%20Jan%209%5D/R/datasets.csv"
# Despite its name, this variable receives what getURL() returns for the URL
# (the downloaded content), not a URL.
myGitHubDataURL <- getURL(myGitHubURL)
# Parse the downloaded text as CSV. stringsAsFactors = TRUE is spelled out
# because the factor-based output recorded below (e.g. "1200 Levels") relies
# on it, and the default became FALSE in R 4.0.0.
myGitHubData <- read.csv(text = myGitHubDataURL, stringsAsFactors = TRUE)

# Preview the first six rows.
head(myGitHubData)
##   Package       Item                                           Title Rows
## 1    boot       acme                          Monthly Excess Returns   60
## 2    boot       aids    Delay in AIDS Reporting in England and Wales  570
## 3    boot  aircondit          Failures of Air-conditioning Equipment   12
## 4    boot aircondit7          Failures of Air-conditioning Equipment   24
## 5    boot       amis                  Car Speeding and Warning Signs 8437
## 6    boot        aml Remission Times for Acute Myelogenous Leukaemia   23
##   Cols has_logical has_binary has_numeric has_character
## 1    3       FALSE      FALSE        TRUE          TRUE
## 2    6       FALSE       TRUE        TRUE         FALSE
## 3    1       FALSE      FALSE        TRUE         FALSE
## 4    1       FALSE      FALSE        TRUE         FALSE
## 5    4       FALSE       TRUE        TRUE         FALSE
## 6    3       FALSE       TRUE        TRUE         FALSE
##                                                                                  CSV
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv
##                                                                                   Doc
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html
# Preview the first entries of the Item column of the GitHub copy, extracted
# by exact name.
head(myGitHubData[["Item"]])
## [1] acme       aids       aircondit  aircondit7 amis       aml       
## 1200 Levels: a10 abbey ability ability.cov absentee accdeaths ... Zelig.url