# Load the Rdatasets index (a CSV of dataset metadata) from a local file.
# getwd() has to be *called*: a bare `getwd` only prints the function's source
# instead of the current working directory.
getwd()
# Machine-specific (Windows) path. NOTE(review): file.choose() below does not
# read from the working directory; presumably this is only meant to set where
# the file dialog opens -- confirm, and adjust the path for your machine.
setwd("C:\\DataScienceAssignments\\RAssignments")
# file.choose() opens an interactive file picker, so this line needs an
# interactive session. stringsAsFactors = TRUE keeps text columns as factors
# (the default before R 4.0.0), which the factor-based str()/summary() output
# recorded below relies on.
Rdatasets <- read.csv(file.choose(), stringsAsFactors = TRUE)
# Number of rows (one per catalogued dataset) in the loaded data frame
nrow(Rdatasets)
## [1] 1340
# Number of columns (metadata fields)
ncol(Rdatasets)
## [1] 12
# Compact overview: each column's type and its first few values
str(Rdatasets)
## 'data.frame': 1340 obs. of 12 variables:
## $ Package : Factor w/ 43 levels "boot","carData",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Item : Factor w/ 1297 levels "a10","abbey",..: 11 19 22 23 43 44 108 120 158 161 ...
## $ Title : Factor w/ 1234 levels "'colors()' in Luv space",..: 731 359 466 466 187 924 129 855 1019 1001 ...
## $ Rows : int 60 570 12 24 8437 23 100 49 823 10 ...
## $ Cols : int 3 6 1 1 4 3 4 2 3 5 ...
## $ n_binary : int 0 1 0 0 1 2 2 0 0 1 ...
## $ n_character: int 1 0 0 0 0 0 0 0 0 0 ...
## $ n_factor : int 0 0 0 0 0 0 0 0 0 1 ...
## $ n_logical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ n_numeric : int 2 6 1 1 4 3 4 2 3 4 ...
## $ CSV : Factor w/ 1340 levels "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Doc : Factor w/ 1340 levels "https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html",..: 1 2 3 4 5 6 7 8 9 10 ...
# 1. Inspect the data with summary() and head()
# summary() prints per-column statistics: counts of the most frequent values
# for factor columns, and min / quartiles / mean / max for numeric columns.
summary(Rdatasets)
## Package Item
## Stat2Data:211 lung : 3
## Ecdat :130 aids : 2
## DAAG :121 channing: 2
## MASS : 87 Cigar : 2
## datasets : 84 cities : 2
## carData : 62 Clothing: 2
## (Other) :645 (Other) :1327
## Title
## Labour Training Evaluation Data : 11
## Seven data sets showing a bifactor solution. : 9
## Individual Preferences Over Immigration Policy : 6
## John Snow's Map and Data on the 1854 London Cholera Outbreak : 5
## Rain, wavesurge, portpirie and nidd datasets. : 4
## Australian and Related Historical Annual Climate Data, by region: 3
## (Other) :1302
## Rows Cols n_binary n_character
## Min. : 0.0 Min. : 1.00 Min. : 0.000 Min. : 0.0
## 1st Qu.: 30.0 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.0
## Median : 91.0 Median : 5.00 Median : 0.000 Median : 0.0
## Mean : 1666.7 Mean : 14.83 Mean : 2.092 Mean : 0.1
## 3rd Qu.: 435.5 3rd Qu.: 9.00 3rd Qu.: 2.000 3rd Qu.: 0.0
## Max. :372864.0 Max. :6831.00 Max. :624.000 Max. :16.0
##
## n_factor n_logical n_numeric
## Min. : 0.00 Min. :0.00000 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:0.00000 1st Qu.: 2.0
## Median : 0.00 Median :0.00000 Median : 4.0
## Mean : 1.19 Mean :0.01343 Mean : 13.5
## 3rd Qu.: 2.00 3rd Qu.:0.00000 3rd Qu.: 7.0
## Max. :64.00 Max. :4.00000 Max. :6830.0
##
## CSV
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv: 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv : 1
## (Other) :1334
## Doc
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html: 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html : 1
## (Other) :1334
# Show the first rows of the full data set (head() prints 6 rows by default)
head(Rdatasets)
## Package Item Title Rows
## 1 boot acme Monthly Excess Returns 60
## 2 boot aids Delay in AIDS Reporting in England and Wales 570
## 3 boot aircondit Failures of Air-conditioning Equipment 12
## 4 boot aircondit7 Failures of Air-conditioning Equipment 24
## 5 boot amis Car Speeding and Warning Signs 8437
## 6 boot aml Remission Times for Acute Myelogenous Leukaemia 23
## Cols n_binary n_character n_factor n_logical n_numeric
## 1 3 0 1 0 0 2
## 2 6 1 0 0 0 6
## 3 1 0 0 0 0 1
## 4 1 0 0 0 0 1
## 5 4 1 0 0 0 4
## 6 3 2 0 0 0 3
## CSV
## 1 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv
## 2 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv
## 3 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv
## 5 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv
## 6 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv
## Doc
## 1 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html
## 2 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html
## 3 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html
## 5 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html
## 6 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html
# Mean and median of the row counts and of the numeric-column counts,
# computed over every dataset listed in the index.
mean.rows <- mean(Rdatasets[["Rows"]])
median.row <- median(Rdatasets[["Rows"]])
print(round(mean.rows, digits = 1))
## [1] 1666.7
print(median.row)
## [1] 91
mean.numeric <- mean(Rdatasets[["n_numeric"]])
median.numeric <- median(Rdatasets[["n_numeric"]])
print(round(mean.numeric, digits = 1))
## [1] 13.5
print(median.numeric)
## [1] 4
# 2. Build a working subset: the first 100 rows of five columns of the
#    original data, with the columns renamed.
mysubdata <- setNames(
  Rdatasets[1:100, c("Item", "Title", "Rows", "Cols", "n_numeric")],
  c("Item", "description", "num.row", "num.col", "n.numeric")
)
# Peek at the first rows of the subset
head(mysubdata)
## Item description num.row
## 1 acme Monthly Excess Returns 60
## 2 aids Delay in AIDS Reporting in England and Wales 570
## 3 aircondit Failures of Air-conditioning Equipment 12
## 4 aircondit7 Failures of Air-conditioning Equipment 24
## 5 amis Car Speeding and Warning Signs 8437
## 6 aml Remission Times for Acute Myelogenous Leukaemia 23
## num.col n.numeric
## 1 3 2
## 2 6 6
## 3 1 1
## 4 1 1
## 5 4 4
## 6 3 3
# Per-column statistics for the subset
summary(mysubdata)
## Item
## acme : 1
## Adler : 1
## aids : 1
## aircondit : 1
## aircondit7: 1
## amis : 1
## (Other) :94
## description
## Acceleration Due to Gravity : 2
## Failures of Air-conditioning Equipment : 2
## Population of U.S. Cities : 2
## 2011 Canadian National Election Study, With Attitude Toward Abortion: 1
## Agricultural Production in Mazulu Village : 1
## American Math Society Survey Data : 1
## (Other) :91
## num.row num.col n.numeric
## Min. : 7.00 Min. : 1.00 Min. : 1.00
## 1st Qu.: 27.75 1st Qu.: 2.00 1st Qu.: 2.00
## Median : 60.00 Median : 4.00 Median : 3.00
## Mean : 1267.83 Mean : 5.32 Mean : 3.73
## 3rd Qu.: 215.75 3rd Qu.: 6.25 3rd Qu.: 5.00
## Max. :51920.00 Max. :62.00 Max. :15.00
##
# 3. Derive a new column: number of rows times number of columns per dataset
mysubdata[["rc.product"]] <- with(mysubdata, num.row * num.col)
# Confirm the new column is present
head(mysubdata)
## Item description num.row
## 1 acme Monthly Excess Returns 60
## 2 aids Delay in AIDS Reporting in England and Wales 570
## 3 aircondit Failures of Air-conditioning Equipment 12
## 4 aircondit7 Failures of Air-conditioning Equipment 24
## 5 amis Car Speeding and Warning Signs 8437
## 6 aml Remission Times for Acute Myelogenous Leukaemia 23
## num.col n.numeric rc.product
## 1 3 2 180
## 2 6 6 3420
## 3 1 1 12
## 4 1 1 24
## 5 4 4 33748
## 6 3 3 69
# 4. Summary of the subset, which now also covers the derived rc.product column
summary(mysubdata)
## Item
## acme : 1
## Adler : 1
## aids : 1
## aircondit : 1
## aircondit7: 1
## amis : 1
## (Other) :94
## description
## Acceleration Due to Gravity : 2
## Failures of Air-conditioning Equipment : 2
## Population of U.S. Cities : 2
## 2011 Canadian National Election Study, With Attitude Toward Abortion: 1
## Agricultural Production in Mazulu Village : 1
## American Math Society Survey Data : 1
## (Other) :91
## num.row num.col n.numeric rc.product
## Min. : 7.00 Min. : 1.00 Min. : 1.00 Min. : 12.0
## 1st Qu.: 27.75 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 81.0
## Median : 60.00 Median : 4.00 Median : 3.00 Median : 283.5
## Mean : 1267.83 Mean : 5.32 Mean : 3.73 Mean : 12514.4
## 3rd Qu.: 215.75 3rd Qu.: 6.25 3rd Qu.: 5.00 3rd Qu.: 994.0
## Max. :51920.00 Max. :62.00 Max. :15.00 Max. :726880.0
##
# Mean and median of the subset (first 100 datasets).
# Note: mean.rows, mean.numeric and median.numeric reuse the names assigned
# for the whole data set earlier in the script, so those values are overwritten.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mean.rows <- mean(mysubdata$num.row, na.rm = TRUE)
median.rows <- median(mysubdata$num.row, na.rm = TRUE)
round(mean.rows, 1)
## [1] 1267.8
median.rows
## [1] 60
#### Rows: the whole-data mean (1666.7) is about 1.3 times the subset mean
#### (1267.8), and the whole-data median (91) is about 1.5 times the subset
#### median (60).
mean.numeric <- mean(mysubdata$n.numeric)
median.numeric <- median(mysubdata$n.numeric)
round(mean.numeric, 1)
## [1] 3.7
median.numeric
## [1] 3
### For the n_numeric column, the subset's mean (3.7) and median (3) are both
### lower than those of the original data (13.5 and 4).
# 5. Convert the numeric-column count into a factor, then relabel its levels
mysubdata[["n.numeric"]] <- factor(mysubdata[["n.numeric"]])
# Structure after the conversion: n.numeric is now a factor
str(mysubdata)
## 'data.frame': 100 obs. of 6 variables:
## $ Item : Factor w/ 1297 levels "a10","abbey",..: 11 19 22 23 43 44 108 120 158 161 ...
## $ description: Factor w/ 1234 levels "'colors()' in Luv space",..: 731 359 466 466 187 924 129 855 1019 1001 ...
## $ num.row : int 60 570 12 24 8437 23 100 49 823 10 ...
## $ num.col : int 3 6 1 1 4 3 4 2 3 5 ...
## $ n.numeric : Factor w/ 11 levels "1","2","3","4",..: 2 6 1 1 4 3 4 2 3 4 ...
## $ rc.product : int 180 3420 12 24 33748 69 400 98 2469 50 ...
# Replace the 11 existing level names, in order, with descriptive labels
levels(mysubdata[["n.numeric"]]) <- c(
  "Null", "weakest", "weaker", "weak", "fair", "average",
  "acceptable", "strong", "stronger", "strongest", "perfect"
)
print(levels(mysubdata[["n.numeric"]]))
## [1] "Null" "weakest" "weaker" "weak" "fair"
## [6] "average" "acceptable" "strong" "stronger" "strongest"
## [11] "perfect"
# 6. Display the first 5 rows of the data frame.
# head() returns 6 rows by default, so the row count is passed explicitly.
head(mysubdata, n = 5)
## Item description num.row num.col
## 1 acme Monthly Excess Returns 60 3
## 2 aids Delay in AIDS Reporting in England and Wales 570 6
## 3 aircondit Failures of Air-conditioning Equipment 12 1
## 4 aircondit7 Failures of Air-conditioning Equipment 24 1
## 5 amis Car Speeding and Warning Signs 8437 4
## n.numeric rc.product
## 1 weakest 180
## 2 average 3420
## 3 Null 12
## 4 Null 24
## 5 weak 33748
# or, equivalently, select the rows by index
mysubdata[1:5, ]
## Item description num.row num.col
## 1 acme Monthly Excess Returns 60 3
## 2 aids Delay in AIDS Reporting in England and Wales 570 6
## 3 aircondit Failures of Air-conditioning Equipment 12 1
## 4 aircondit7 Failures of Air-conditioning Equipment 24 1
## 5 amis Car Speeding and Warning Signs 8437 4
## n.numeric rc.product
## 1 weakest 180
## 2 average 3420
## 3 Null 12
## 4 Null 24
## 5 weak 33748
# 7. Read the same dataset index straight from GitHub
library(readr)
# NOTE(review): read.csv() below is base R (utils); no readr function is called
# in this section, so the library() call above looks unnecessary -- confirm
# before removing it.
# stringsAsFactors = TRUE keeps text columns as factors (the default before
# R 4.0.0) so that summary() reports value counts as in the output recorded
# below.
gitDatasets <- read.csv(
  "https://raw.githubusercontent.com/AlainKuiete/SummerBridge/master/datasets.csv",
  stringsAsFactors = TRUE
)
summary(gitDatasets)
## Package Item
## Stat2Data:211 lung : 3
## Ecdat :130 aids : 2
## DAAG :121 channing: 2
## MASS : 87 Cigar : 2
## datasets : 84 cities : 2
## carData : 62 Clothing: 2
## (Other) :645 (Other) :1327
## Title
## Labour Training Evaluation Data : 11
## Seven data sets showing a bifactor solution. : 9
## Individual Preferences Over Immigration Policy : 6
## John Snow's Map and Data on the 1854 London Cholera Outbreak : 5
## Rain, wavesurge, portpirie and nidd datasets. : 4
## Australian and Related Historical Annual Climate Data, by region: 3
## (Other) :1302
## Rows Cols n_binary n_character
## Min. : 0.0 Min. : 1.00 Min. : 0.000 Min. : 0.0
## 1st Qu.: 30.0 1st Qu.: 3.00 1st Qu.: 0.000 1st Qu.: 0.0
## Median : 91.0 Median : 5.00 Median : 0.000 Median : 0.0
## Mean : 1666.7 Mean : 14.83 Mean : 2.092 Mean : 0.1
## 3rd Qu.: 435.5 3rd Qu.: 9.00 3rd Qu.: 2.000 3rd Qu.: 0.0
## Max. :372864.0 Max. :6831.00 Max. :624.000 Max. :16.0
##
## n_factor n_logical n_numeric
## Min. : 0.00 Min. :0.00000 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:0.00000 1st Qu.: 2.0
## Median : 0.00 Median :0.00000 Median : 4.0
## Mean : 1.19 Mean :0.01343 Mean : 13.5
## 3rd Qu.: 2.00 3rd Qu.:0.00000 3rd Qu.: 7.0
## Max. :64.00 Max. :4.00000 Max. :6830.0
##
## CSV
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv: 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv : 1
## (Other) :1334
## Doc
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html: 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html : 1
## https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html : 1
## (Other) :1334