Summer Bridge R Assignment Week 2

# Reading the csv file from my computer.
# getwd must be called with parentheses: a bare `getwd` only prints the
# function's source code instead of the current working directory.
getwd()

# (prints the current working directory; the value is machine-specific)

# NOTE(review): this absolute Windows path is specific to the author's machine;
# setwd() fails with an error on a machine where the directory does not exist.
setwd("C:\\DataScienceAssignments\\RAssignments")

# Pick the csv file interactively. stringsAsFactors = TRUE keeps text columns
# as factors on every R version (the default changed to FALSE in R 4.0.0),
# which is what the factor-based str() and summary() output below relies on.
Rdatasets <- read.csv(file.choose(), stringsAsFactors = TRUE)
# Number of rows: first element of the dimension vector
dim(Rdatasets)[1L]

## [1] 1340

# Number of columns: second element of the dimension vector
dim(Rdatasets)[2L]

## [1] 12

# Compact overview of each column's type and leading values
utils::str(Rdatasets)

## 'data.frame':    1340 obs. of  12 variables:
##  $ Package    : Factor w/ 43 levels "boot","carData",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Item       : Factor w/ 1297 levels "a10","abbey",..: 11 19 22 23 43 44 108 120 158 161 ...
##  $ Title      : Factor w/ 1234 levels "'colors()' in Luv space",..: 731 359 466 466 187 924 129 855 1019 1001 ...
##  $ Rows       : int  60 570 12 24 8437 23 100 49 823 10 ...
##  $ Cols       : int  3 6 1 1 4 3 4 2 3 5 ...
##  $ n_binary   : int  0 1 0 0 1 2 2 0 0 1 ...
##  $ n_character: int  1 0 0 0 0 0 0 0 0 0 ...
##  $ n_factor   : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ n_logical  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ n_numeric  : int  2 6 1 1 4 3 4 2 3 4 ...
##  $ CSV        : Factor w/ 1340 levels "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Doc        : Factor w/ 1340 levels "https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html",..: 1 2 3 4 5 6 7 8 9 10 ...

#1.

# Per-column summary of the full data set (head() follows further down)
summary(object = Rdatasets)

##       Package          Item     
##  Stat2Data:211   lung    :   3  
##  Ecdat    :130   aids    :   2  
##  DAAG     :121   channing:   2  
##  MASS     : 87   Cigar   :   2  
##  datasets : 84   cities  :   2  
##  carData  : 62   Clothing:   2  
##  (Other)  :645   (Other) :1327  
##                                                               Title     
##  Labour Training Evaluation Data                                 :  11  
##  Seven data sets showing a bifactor solution.                    :   9  
##  Individual Preferences Over Immigration Policy                  :   6  
##  John Snow's Map and Data on the 1854 London Cholera Outbreak    :   5  
##  Rain, wavesurge, portpirie and nidd datasets.                   :   4  
##  Australian and Related Historical Annual Climate Data, by region:   3  
##  (Other)                                                         :1302  
##       Rows               Cols            n_binary        n_character  
##  Min.   :     0.0   Min.   :   1.00   Min.   :  0.000   Min.   : 0.0  
##  1st Qu.:    30.0   1st Qu.:   3.00   1st Qu.:  0.000   1st Qu.: 0.0  
##  Median :    91.0   Median :   5.00   Median :  0.000   Median : 0.0  
##  Mean   :  1666.7   Mean   :  14.83   Mean   :  2.092   Mean   : 0.1  
##  3rd Qu.:   435.5   3rd Qu.:   9.00   3rd Qu.:  2.000   3rd Qu.: 0.0  
##  Max.   :372864.0   Max.   :6831.00   Max.   :624.000   Max.   :16.0  
##                                                                       
##     n_factor       n_logical         n_numeric     
##  Min.   : 0.00   Min.   :0.00000   Min.   :   0.0  
##  1st Qu.: 0.00   1st Qu.:0.00000   1st Qu.:   2.0  
##  Median : 0.00   Median :0.00000   Median :   4.0  
##  Mean   : 1.19   Mean   :0.01343   Mean   :  13.5  
##  3rd Qu.: 2.00   3rd Qu.:0.00000   3rd Qu.:   7.0  
##  Max.   :64.00   Max.   :4.00000   Max.   :6830.0  
##                                                    
##                                                                                  CSV      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv       :   1  
##  (Other)                                                                           :1334  
##                                                                                   Doc      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html       :   1  
##  (Other)                                                                            :1334

head(x = Rdatasets, n = 6L)

##   Package       Item                                           Title Rows
## 1    boot       acme                          Monthly Excess Returns   60
## 2    boot       aids    Delay in AIDS Reporting in England and Wales  570
## 3    boot  aircondit          Failures of Air-conditioning Equipment   12
## 4    boot aircondit7          Failures of Air-conditioning Equipment   24
## 5    boot       amis                  Car Speeding and Warning Signs 8437
## 6    boot        aml Remission Times for Acute Myelogenous Leukaemia   23
##   Cols n_binary n_character n_factor n_logical n_numeric
## 1    3        0           1        0         0         2
## 2    6        1           0        0         0         6
## 3    1        0           0        0         0         1
## 4    1        0           0        0         0         1
## 5    4        1           0        0         0         4
## 6    3        2           0        0         0         3
##                                                                                  CSV
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv
##                                                                                   Doc
## 1       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html
## 2       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html
## 3  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html
## 4 https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html
## 5       https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html
## 6        https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html

# Mean and median of the Rows column over the full data set
median.row <- median(Rdatasets[["Rows"]])
mean.rows <- mean(Rdatasets[["Rows"]])
round(mean.rows, digits = 1)

## [1] 1666.7

median.row

## [1] 91

# The same two statistics for the n_numeric column
median.numeric <- median(Rdatasets[["n_numeric"]])
mean.numeric <- mean(Rdatasets[["n_numeric"]])
round(mean.numeric, digits = 1)

## [1] 13.5

median.numeric

## [1] 4

#2. Build a subset of the original: five renamed columns, first 100 rows
mysubdata <- data.frame(
  Item = Rdatasets[["Item"]],
  description = Rdatasets[["Title"]],
  num.row = Rdatasets[["Rows"]],
  num.col = Rdatasets[["Cols"]],
  n.numeric = Rdatasets[["n_numeric"]]
)[seq_len(100), ]

head(x = mysubdata, n = 6L)

##         Item                                     description num.row
## 1       acme                          Monthly Excess Returns      60
## 2       aids    Delay in AIDS Reporting in England and Wales     570
## 3  aircondit          Failures of Air-conditioning Equipment      12
## 4 aircondit7          Failures of Air-conditioning Equipment      24
## 5       amis                  Car Speeding and Warning Signs    8437
## 6        aml Remission Times for Acute Myelogenous Leukaemia      23
##   num.col n.numeric
## 1       3         2
## 2       6         6
## 3       1         1
## 4       1         1
## 5       4         4
## 6       3         3

summary(object = mysubdata)

##          Item   
##  acme      : 1  
##  Adler     : 1  
##  aids      : 1  
##  aircondit : 1  
##  aircondit7: 1  
##  amis      : 1  
##  (Other)   :94  
##                                                                description
##  Acceleration Due to Gravity                                         : 2  
##  Failures of Air-conditioning Equipment                              : 2  
##  Population of U.S. Cities                                           : 2  
##  2011 Canadian National Election Study, With Attitude Toward Abortion: 1  
##  Agricultural Production in Mazulu Village                           : 1  
##  American Math Society Survey Data                                   : 1  
##  (Other)                                                             :91  
##     num.row            num.col        n.numeric    
##  Min.   :    7.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:   27.75   1st Qu.: 2.00   1st Qu.: 2.00  
##  Median :   60.00   Median : 4.00   Median : 3.00  
##  Mean   : 1267.83   Mean   : 5.32   Mean   : 3.73  
##  3rd Qu.:  215.75   3rd Qu.: 6.25   3rd Qu.: 5.00  
##  Max.   :51920.00   Max.   :62.00   Max.   :15.00  
##

#3 Add a column holding (number of rows) x (number of columns) per data set
mysubdata[["rc.product"]] <- with(mysubdata, num.row * num.col)
head(x = mysubdata, n = 6L)

##         Item                                     description num.row
## 1       acme                          Monthly Excess Returns      60
## 2       aids    Delay in AIDS Reporting in England and Wales     570
## 3  aircondit          Failures of Air-conditioning Equipment      12
## 4 aircondit7          Failures of Air-conditioning Equipment      24
## 5       amis                  Car Speeding and Warning Signs    8437
## 6        aml Remission Times for Acute Myelogenous Leukaemia      23
##   num.col n.numeric rc.product
## 1       3         2        180
## 2       6         6       3420
## 3       1         1         12
## 4       1         1         24
## 5       4         4      33748
## 6       3         3         69

#4 Per-column summary of the subset data frame
summary(object = mysubdata)

##          Item   
##  acme      : 1  
##  Adler     : 1  
##  aids      : 1  
##  aircondit : 1  
##  aircondit7: 1  
##  amis      : 1  
##  (Other)   :94  
##                                                                description
##  Acceleration Due to Gravity                                         : 2  
##  Failures of Air-conditioning Equipment                              : 2  
##  Population of U.S. Cities                                           : 2  
##  2011 Canadian National Election Study, With Attitude Toward Abortion: 1  
##  Agricultural Production in Mazulu Village                           : 1  
##  American Math Society Survey Data                                   : 1  
##  (Other)                                                             :91  
##     num.row            num.col        n.numeric       rc.product      
##  Min.   :    7.00   Min.   : 1.00   Min.   : 1.00   Min.   :    12.0  
##  1st Qu.:   27.75   1st Qu.: 2.00   1st Qu.: 2.00   1st Qu.:    81.0  
##  Median :   60.00   Median : 4.00   Median : 3.00   Median :   283.5  
##  Mean   : 1267.83   Mean   : 5.32   Mean   : 3.73   Mean   : 12514.4  
##  3rd Qu.:  215.75   3rd Qu.: 6.25   3rd Qu.: 5.00   3rd Qu.:   994.0  
##  Max.   :51920.00   Max.   :62.00   Max.   :15.00   Max.   :726880.0  
##

# Mean and median of num.row in the subset.
# na.rm is spelled TRUE rather than T: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved constant.
mean.rows <- mean(mysubdata$num.row, na.rm = TRUE)
median.rows <- median(mysubdata$num.row, na.rm = TRUE)
round(mean.rows, 1)

## [1] 1267.8

median.rows

## [1] 60

#### The mean of Rows for the whole data (1666.7) is about 1.3 times the subset
#### mean (1267.8); the median (91) is about 1.5 times the subset median (60)


# Mean and median of n.numeric in the subset
median.numeric <- median(mysubdata[["n.numeric"]])
mean.numeric <- mean(mysubdata[["n.numeric"]])
round(mean.numeric, digits = 1)

## [1] 3.7

median.numeric

## [1] 3

### The mean (3.7) and median (3) of n_numeric in the subset are lower than
### those of the original data (13.5 and 4)

#5 Convert n.numeric to a factor and inspect the resulting structure
mysubdata[["n.numeric"]] <- factor(x = mysubdata[["n.numeric"]])
utils::str(mysubdata)

## 'data.frame':    100 obs. of  6 variables:
##  $ Item       : Factor w/ 1297 levels "a10","abbey",..: 11 19 22 23 43 44 108 120 158 161 ...
##  $ description: Factor w/ 1234 levels "'colors()' in Luv space",..: 731 359 466 466 187 924 129 855 1019 1001 ...
##  $ num.row    : int  60 570 12 24 8437 23 100 49 823 10 ...
##  $ num.col    : int  3 6 1 1 4 3 4 2 3 5 ...
##  $ n.numeric  : Factor w/ 11 levels "1","2","3","4",..: 2 6 1 1 4 3 4 2 3 4 ...
##  $ rc.product : int  180 3420 12 24 33748 69 400 98 2469 50 ...

# Relabel the factor levels, in level order, with descriptive names
levels(mysubdata$n.numeric) <- c(
  "Null", "weakest", "weaker", "weak", "fair", "average",
  "acceptable", "strong", "stronger", "strongest", "perfect"
)
levels(x = mysubdata$n.numeric)

##  [1] "Null"       "weakest"    "weaker"     "weak"       "fair"      
##  [6] "average"    "acceptable" "strong"     "stronger"   "strongest" 
## [11] "perfect"

#6 Displaying my dataframe with 5 rows
# head() returns 6 rows by default, so the row count is passed explicitly.
head(mysubdata, n = 5)

##         Item                                  description num.row num.col
## 1       acme                       Monthly Excess Returns      60       3
## 2       aids Delay in AIDS Reporting in England and Wales     570       6
## 3  aircondit       Failures of Air-conditioning Equipment      12       1
## 4 aircondit7       Failures of Air-conditioning Equipment      24       1
## 5       amis               Car Speeding and Warning Signs    8437       4
##   n.numeric rc.product
## 1   weakest        180
## 2   average       3420
## 3      Null         12
## 4      Null         24
## 5      weak      33748

# or, by indexing the first five rows directly
mysubdata[seq_len(5), ]

##         Item                                  description num.row num.col
## 1       acme                       Monthly Excess Returns      60       3
## 2       aids Delay in AIDS Reporting in England and Wales     570       6
## 3  aircondit       Failures of Air-conditioning Equipment      12       1
## 4 aircondit7       Failures of Air-conditioning Equipment      24       1
## 5       amis               Car Speeding and Warning Signs    8437       4
##   n.numeric rc.product
## 1   weakest        180
## 2   average       3420
## 3      Null         12
## 4      Null         24
## 5      weak      33748

#7 Get the dataset file from GitHub
library(readr)
# NOTE: read.csv() comes from the utils package, not readr; readr is attached
# above but not used by this call.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (the default changed to FALSE in R 4.0.0), matching the factor-style
# summary() output recorded below.
gitDatasets <- read.csv(
  "https://raw.githubusercontent.com/AlainKuiete/SummerBridge/master/datasets.csv",
  stringsAsFactors = TRUE
)
summary(gitDatasets)

##       Package          Item     
##  Stat2Data:211   lung    :   3  
##  Ecdat    :130   aids    :   2  
##  DAAG     :121   channing:   2  
##  MASS     : 87   Cigar   :   2  
##  datasets : 84   cities  :   2  
##  carData  : 62   Clothing:   2  
##  (Other)  :645   (Other) :1327  
##                                                               Title     
##  Labour Training Evaluation Data                                 :  11  
##  Seven data sets showing a bifactor solution.                    :   9  
##  Individual Preferences Over Immigration Policy                  :   6  
##  John Snow's Map and Data on the 1854 London Cholera Outbreak    :   5  
##  Rain, wavesurge, portpirie and nidd datasets.                   :   4  
##  Australian and Related Historical Annual Climate Data, by region:   3  
##  (Other)                                                         :1302  
##       Rows               Cols            n_binary        n_character  
##  Min.   :     0.0   Min.   :   1.00   Min.   :  0.000   Min.   : 0.0  
##  1st Qu.:    30.0   1st Qu.:   3.00   1st Qu.:  0.000   1st Qu.: 0.0  
##  Median :    91.0   Median :   5.00   Median :  0.000   Median : 0.0  
##  Mean   :  1666.7   Mean   :  14.83   Mean   :  2.092   Mean   : 0.1  
##  3rd Qu.:   435.5   3rd Qu.:   9.00   3rd Qu.:  2.000   3rd Qu.: 0.0  
##  Max.   :372864.0   Max.   :6831.00   Max.   :624.000   Max.   :16.0  
##                                                                       
##     n_factor       n_logical         n_numeric     
##  Min.   : 0.00   Min.   :0.00000   Min.   :   0.0  
##  1st Qu.: 0.00   1st Qu.:0.00000   1st Qu.:   2.0  
##  Median : 0.00   Median :0.00000   Median :   4.0  
##  Mean   : 1.19   Mean   :0.01343   Mean   :  13.5  
##  3rd Qu.: 2.00   3rd Qu.:0.00000   3rd Qu.:   7.0  
##  Max.   :64.00   Max.   :4.00000   Max.   :6830.0  
##                                                    
##                                                                                  CSV      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/acme.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aids.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit.csv :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aircondit7.csv:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/boot/aml.csv       :   1  
##  (Other)                                                                           :1334  
##                                                                                   Doc      
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/acme.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aids.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit.html :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aircondit7.html:   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/amis.html      :   1  
##  https://raw.github.com/vincentarelbundock/Rdatasets/master/doc/boot/aml.html       :   1  
##  (Other)                                                                            :1334

Summer Bridge R Assignment Week 2

Alain T Kuiete

7/27/2019