MSDS Homework Week2

Data

Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list) Please select one, download it and perform the following tasks:
Packages Used

Tidyverse

stringr

xml

data.table

Loading Packages


install.packages("tidyverse")
suppressWarnings(library(tidyverse))
suppressWarnings(library(stringr))
suppressWarnings(library(data.table))
library(plyr)

Question 1

Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes

Read File

# Downloading the file for Location and Name it Migraine_Headaches.csv

getwd() # Working Directory

## [1] "C:/Users/951250/Documents/myR"

file <- "https://vincentarelbundock.github.io/Rdatasets/csv/carData/KosteckiDillon.csv"
download.file( url= file, destfile="data/Migraine_Headaches.csv" )
dataCSVLocal <- read.csv(paste0(getwd(),"/data/Migraine_Headaches.csv"))

head(dataCSVLocal)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

#Open it from Web
dataCSV <- read.csv(file)
identical(dataCSVLocal,dataCSV)

## [1] TRUE

class(dataCSV)

## [1] "data.frame"

Summary of Data

Head & Tail

head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

tail(dataCSV)

##         X  id time  dos hatype age airq medication headache  sex
## 4147 4147 133   43 1231   Aura  26   19 continuing      yes male
## 4148 4148 133   44 1232   Aura  26   20 continuing      yes male
## 4149 4149 133   45 1233   Aura  26   27 continuing       no male
## 4150 4150 133   46 1234   Aura  26   28 continuing       no male
## 4151 4151 133   47 1235   Aura  26   33 continuing       no male
## 4152 4152 133   48 1236   Aura  26   25 continuing      yes male

summary

summary(dataCSV)

##        X              id              time             dos        
##  Min.   :   1   Min.   :  1.00   Min.   :-29.00   Min.   :  98.0  
##  1st Qu.:1039   1st Qu.: 33.00   1st Qu.:  3.00   1st Qu.: 384.0  
##  Median :2076   Median : 67.00   Median : 12.00   Median : 623.0  
##  Mean   :2076   Mean   : 66.39   Mean   : 15.46   Mean   : 646.7  
##  3rd Qu.:3114   3rd Qu.:100.00   3rd Qu.: 24.00   3rd Qu.: 950.0  
##  Max.   :4152   Max.   :133.00   Max.   : 99.00   Max.   :1239.0  
##      hatype          age             airq            medication  
##  Aura   :1710   Min.   :18.00   Min.   : 3.00   continuing:2386  
##  Mixed  : 457   1st Qu.:33.00   1st Qu.:18.00   none      : 785  
##  No Aura:1985   Median :44.00   Median :24.00   reduced   : 981  
##                 Mean   :42.36   Mean   :24.83                    
##                 3rd Qu.:50.00   3rd Qu.:29.00                    
##                 Max.   :66.00   Max.   :73.00                    
##  headache       sex      
##  no :1486   female:3545  
##  yes:2666   male  : 607  
##                          
##                          
##                          
##

Dimenssion

dim(dataCSV)

## [1] 4152   10

Structure

str(dataCSV)

## 'data.frame':    4152 obs. of  10 variables:
##  $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ time      : int  -11 -10 -9 -8 -7 -6 -5 22 23 24 ...
##  $ dos       : int  753 754 755 756 757 758 759 786 787 788 ...
##  $ hatype    : Factor w/ 3 levels "Aura","Mixed",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ age       : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ airq      : num  9 7 10 13 18 19 17 21 21 18 ...
##  $ medication: Factor w/ 3 levels "continuing","none",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ headache  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ sex       : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...

Rows

nrow(dataCSV)

## [1] 4152

Columns

ncol(dataCSV)

## [1] 10

Mean

#...........................................mean  (Time and AGE)
head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

data.Mean.Time.Age <- sapply( list( dataCSV$time,dataCSV$age ), mean)
names(data.Mean.Time.Age ) <- c("Time", "Age")

for (n in 1:length(data.Mean.Time.Age))
{
  print(sprintf(" Mean of %s is %f", names(data.Mean.Time.Age[n]) ,data.Mean.Time.Age[n]))
}

## [1] " Mean of Time is 15.455684"
## [1] " Mean of Age is 42.363921"

Median

#...........................................Meadion (Time and AGE)
head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

data.Md.Time.Age <- lapply( list( "Time" = dataCSV$time,"Age" = dataCSV$age ), median)
class(data.Md.Time.Age)

## [1] "list"

length(data.Md.Time.Age)

## [1] 2

data.Md.Time.Age

## $Time
## [1] 12
## 
## $Age
## [1] 44

for (n in 1: length(data.Md.Time.Age))
{
  print( paste("Median for", names(data.Md.Time.Age[n])," - ", data.Md.Time.Age[n]))
}

## [1] "Median for Time  -  12"
## [1] "Median for Age  -  44"

Question 2

Create a new data frame with a subset of the columns and rows. Make sure to rename it

Solution 1

Objective

# Get all the Age greater than 30 from the dataset.
# Create a subset of this data and put it in dataframe

Data

#......................Using subset to create subset of column (Solution 1)
head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

Using Subset & Rename

# Get id,age,dos the data for Age greater than 30 from the dataset.
dataCSV.COL.Sub.Id_Age_Dos <- subset(dataCSV,  dataCSV$age > 30 , select = c(id,age,dos) )

# Get id,age,medication,sex the data for Age greater than 30 from the dataset.
dataCSV.COL.Sub.Id_Age_Med_Sex <- subset(dataCSV,  dataCSV$age > 30 , select = c(id,age,medication,sex) )


# Creating new data Frame with some column from above subset data dataframe
dataCSV.COL.Sub.frame <- data.frame(
  dataCSV.COL.Sub.Id_Age_Dos[c("id","age")],
  "Medicince Status" = dataCSV.COL.Sub.Id_Age_Med_Sex$medication,
  "Sex" = dataCSV.COL.Sub.Id_Age_Med_Sex$sex)

head(dataCSV.COL.Sub.Id_Age_Dos)

##    id age dos
## 21  2  36 128
## 22  2  36 129
## 23  2  36 130
## 24  2  36 131
## 25  2  36 132
## 26  2  36 133

head(dataCSV.COL.Sub.Id_Age_Med_Sex)

##    id age medication    sex
## 21  2  36 continuing female
## 22  2  36 continuing female
## 23  2  36 continuing female
## 24  2  36 continuing female
## 25  2  36 continuing female
## 26  2  36 continuing female

head(dataCSV.COL.Sub.frame)

##    id age Medicince.Status    Sex
## 21  2  36       continuing female
## 22  2  36       continuing female
## 23  2  36       continuing female
## 24  2  36       continuing female
## 25  2  36       continuing female
## 26  2  36       continuing female

# Renaming the Columns
names(dataCSV.COL.Sub.frame) <- c("ID","Age","Medicine","Sex")
head(dataCSV.COL.Sub.frame)

##    ID Age   Medicine    Sex
## 21  2  36 continuing female
## 22  2  36 continuing female
## 23  2  36 continuing female
## 24  2  36 continuing female
## 25  2  36 continuing female
## 26  2  36 continuing female

#Rename using PLAYR
require(plyr)
 head(dataCSV.COL.Sub.frame)

##    ID Age   Medicine    Sex
## 21  2  36 continuing female
## 22  2  36 continuing female
## 23  2  36 continuing female
## 24  2  36 continuing female
## 25  2  36 continuing female
## 26  2  36 continuing female

 dataCSV.COL.Sub.frame <- rename(dataCSV.COL.Sub.frame,c("ID"="Patient_ID", "Medicine"="Staus"))
 
  head(dataCSV.COL.Sub.frame)

##    Patient_ID Age      Staus    Sex
## 21          2  36 continuing female
## 22          2  36 continuing female
## 23          2  36 continuing female
## 24          2  36 continuing female
## 25          2  36 continuing female
## 26          2  36 continuing female

Solution 2

Using cbind and changing Names

head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

dataCSV.COL.1 <- dataCSV[c("id","dos")]
names(dataCSV.COL.1) <- c( "ID" , "Dos")  # naming the Data set 
dataCSV.COL.2 <- dataCSV$time 
dataSubset <- cbind( dataCSV.COL.1, "Time" = dataCSV.COL.2)
class(dataSubset)

## [1] "data.frame"

class(dataSubset) # Data Frame

## [1] "data.frame"

head(dataSubset)  #Displaying data

##   ID Dos Time
## 1  1 753  -11
## 2  1 754  -10
## 3  1 755   -9
## 4  1 756   -8
## 5  1 757   -7
## 6  1 758   -6

Solution 3

#...........................................Using Join / Merge / aggregate  (Solution 3)

dataID.sum.Time <- aggregate(list("Sum Time" = dataCSV$time ),by = list("ID" = dataCSV$id ) , sum)
dataID.mean.Time <- aggregate( time ~ id  , data = dataCSV, mean)
dataID.age <- aggregate(age ~ id ,  data = dataCSV, mean)
names(dataID.age) <- c("ID", "AGE")
names(dataID.mean.Time) <- c("ID", "Mean Time")
head(dataID.sum.Time)

##   ID Sum.Time
## 1  1      316
## 2  2     1428
## 3  3     2485
## 4  4      574
## 5  5     1830
## 6  6      528

head(dataID.mean.Time)

##   ID Mean Time
## 1  1  15.80000
## 2  2  31.73333
## 3  3  35.50000
## 4  4  17.39394
## 5  5  30.50000
## 6  6  16.50000

head(dataID.age )

##   ID AGE
## 1  1  30
## 2  2  36
## 3  3  28
## 4  4  33
## 5  5  63
## 6  6  62

# merging data of dataID.sum.Time to dataID.age 
dataMergeAge <- merge(x=dataID.sum.Time, y= dataID.age, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )


# merging data  dataMergeAge to dataID.mean.Time show Average time and Total Time taken by Each patient along with their AGE
dataMerge <- merge(x=dataMergeAge, y= dataID.mean.Time, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )
names(dataMerge) <- c("Patient ID","Total Time","Age","Avg Time")
head(dataMerge)   # Soluiton

##   Patient ID Total Time Age Avg Time
## 1          1        316  30 15.80000
## 2          2       1428  36 31.73333
## 3          3       2485  28 35.50000
## 4          4        574  33 17.39394
## 5          5       1830  63 30.50000
## 6          6        528  62 16.50000

# Using Join from playr
require(plyr)
dataJoin <- join(x = dataID.sum.Time, y = dataID.mean.Time, by = c("ID"))
names(dataJoin) <- c("Patient ID","Total Time","Avg Time")
head(dataJoin)

##   Patient ID Total Time Avg Time
## 1          1        316 15.80000
## 2          2       1428 31.73333
## 3          3       2485 35.50000
## 4          4        574 17.39394
## 5          5       1830 30.50000
## 6          6        528 16.50000

Solution 4

#using setNames 
glimpse(dataSubset)

## Observations: 4,152
## Variables: 3
## $ ID   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Dos  <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 79...
## $ Time <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29,...

NameDataCSV <- setNames(dataSubset,letters[1:length(dataSubset)])
glimpse(NameDataCSV)

## Observations: 4,152
## Variables: 3
## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...

Question 3

Create new column names for the new data frame.

Solution 1

Using Plyr package

head(dataMerge) # using Data from solution 3 of question 2

##   Patient ID Total Time Age Avg Time
## 1          1        316  30 15.80000
## 2          2       1428  36 31.73333
## 3          3       2485  28 35.50000
## 4          4        574  33 17.39394
## 5          5       1830  63 30.50000
## 6          6        528  62 16.50000

names(dataMerge) # Know current name of the columns

## [1] "Patient ID" "Total Time" "Age"        "Avg Time"

class(dataMerge)  # making sure I am working with Data Frame

## [1] "data.frame"

# Using Plyr package  
#require(plyr)
library(plyr)
names(dataMerge) # Know  name before Change of the columns

## [1] "Patient ID" "Total Time" "Age"        "Avg Time"

dataMerge <- rename(dataMerge,c("Patient ID" = "P_ID", "Total Time" = "Medicine Time"))
names(dataMerge) # New name of the columns

## [1] "P_ID"          "Medicine Time" "Age"           "Avg Time"

Solution 2

Using base R function

head(dataMerge) # using Data from solution 2 of question 2

##   P_ID Medicine Time Age Avg Time
## 1    1           316  30 15.80000
## 2    2          1428  36 31.73333
## 3    3          2485  28 35.50000
## 4    4           574  33 17.39394
## 5    5          1830  63 30.50000
## 6    6           528  62 16.50000

names(dataMerge) # Know current name of the columns

## [1] "P_ID"          "Medicine Time" "Age"           "Avg Time"

class(dataMerge)  # making sure I am working with Data Frame

## [1] "data.frame"

# Creating New Name for Columns
names(dataMerge) = c("P ID","Time Spent on Medicine", "Age of Patient","Avg. Time")
names(dataMerge) # New name of the columns

## [1] "P ID"                   "Time Spent on Medicine"
## [3] "Age of Patient"         "Avg. Time"

Question 4

Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.

Data Check

head(dataMerge)

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316             30  15.80000
## 2    2                   1428             36  31.73333
## 3    3                   2485             28  35.50000
## 4    4                    574             33  17.39394
## 5    5                   1830             63  30.50000
## 6    6                    528             62  16.50000

Summary

#.............................Using Summary
summary(dataMerge)

##       P ID     Time Spent on Medicine Age of Patient    Avg. Time     
##  Min.   :  1   Min.   :-244.0         Min.   :18.00   Min.   :-6.333  
##  1st Qu.: 34   1st Qu.: 126.0         1st Qu.:33.00   1st Qu.: 6.741  
##  Median : 67   Median : 210.0         Median :44.00   Median :10.000  
##  Mean   : 67   Mean   : 482.5         Mean   :42.16   Mean   :12.571  
##  3rd Qu.:100   3rd Qu.: 574.0         3rd Qu.:50.00   3rd Qu.:14.706  
##  Max.   :133   Max.   :4719.0         Max.   :66.00   Max.   :55.833

str(dataMerge)

## 'data.frame':    133 obs. of  4 variables:
##  $ P ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Time Spent on Medicine: int  316 1428 2485 574 1830 528 1596 152 1053 888 ...
##  $ Age of Patient        : num  30 36 28 33 63 62 42 43 43 30 ...
##  $ Avg. Time             : num  15.8 31.7 35.5 17.4 30.5 ...

names(dataMerge)

## [1] "P ID"                   "Time Spent on Medicine"
## [3] "Age of Patient"         "Avg. Time"

summary(dataMerge,digits = 0,maxsum = 2)  # Disaplying No Decimals and Factors upto 2 if any

##       P ID     Time Spent on Medicine Age of Patient   Avg. Time 
##  Min.   :  1   Min.   :-244           Min.   :18     Min.   :-6  
##  1st Qu.: 34   1st Qu.: 126           1st Qu.:33     1st Qu.: 7  
##  Median : 67   Median : 210           Median :44     Median :10  
##  Mean   : 67   Mean   : 482           Mean   :42     Mean   :13  
##  3rd Qu.:100   3rd Qu.: 574           3rd Qu.:50     3rd Qu.:15  
##  Max.   :133   Max.   :4719           Max.   :66     Max.   :56

#Disaply only 3 Age as factor  
summary( as.factor(dataMerge$`Age`),maxsum=3)

##      46      43 (Other) 
##      11       7     115

Mean

#Mean
dataMerge.Mean.Time <- mean(dataMerge$`Time Spent on Medicine`)
sprintf("Mean of Time %.2f",dataMerge.Mean.Time)

## [1] "Mean of Time 482.50"

dataMerge.Mean.Age <- mean(dataMerge$`Age of Patient`)
sprintf("Mean of Age  %.2f",dataMerge.Mean.Age)

## [1] "Mean of Age  42.16"

#Hold new Mean Data in data Frame 
dataMerge.Mean.Table <- data.frame(dataMerge.Mean.Age,dataMerge.Mean.Time) 
names(dataMerge.Mean.Table) <- c("Age","Time")
class(dataMerge.Mean.Table)

## [1] "data.frame"

head(dataMerge.Mean.Table)

##        Age     Time
## 1 42.15789 482.4962

Median

#Median
dataMerge.Md.Time <- median(dataMerge$`Time Spent on Medicine`)
sprintf("Median of Time  %.0f",dataMerge.Md.Time)

## [1] "Median of Time  210"

dataMerge.Md.Age <- median(dataMerge$`Age of Patient`)
sprintf("Median of Age %.0f",dataMerge.Md.Age)

## [1] "Median of Age 44"

#Hold new Median Data in data Frame 
dataMerge.Md.Table <- data.frame(dataMerge.Md.Age,dataMerge.Md.Time) 
names(dataMerge.Md.Table) <- c("Age","Time")
class(dataMerge.Md.Table)

## [1] "data.frame"

head(dataMerge.Md.Table)

##   Age Time
## 1  44  210

Compare

Comapre the Two Mean and Median with Result of Question 1, which is stored in data frame “data.Mean.Time.Age” “data.Md.Dos.Age”

Mean

#Comapre the Two Mean and Median with Result of Question 1, 
#which is stored in data frame "data.Mean.Time.Age" "data.Md.Dos.Age "

  class(data.Mean.Time.Age)                   # Mean from Base data

## [1] "numeric"

  class(data.Md.Time.Age)                     # Median from Base data

## [1] "list"

  class(dataMerge.Mean.Table)                 # Mean from SUBSET data

## [1] "data.frame"

  class(dataMerge.Md.Table)                   # Median from SUBSET data

## [1] "data.frame"

  for (n in 1:length(data.Mean.Time.Age))
{ 
    name <- names(data.Mean.Time.Age[n])
    print(sprintf(" Mean of Base %s is %f", name ,data.Mean.Time.Age[n]))
    print(sprintf(" Mean of New Subset %s is %f", names(dataMerge.Mean.Table[name]), dataMerge.Mean.Table[as.character(name)]))
    
    if((identical(data.Mean.Time.Age[n],dataMerge.Mean.Table[n]))==TRUE){
       print(sprintf("Mean of %s does match",name))
      
    } else {
      print(sprintf("Mean of %s does not match",name))
      
    }
  }

## [1] " Mean of Base Time is 15.455684"
## [1] " Mean of New Subset Time is 482.496241"
## [1] "Mean of Time does not match"
## [1] " Mean of Base Age is 42.363921"
## [1] " Mean of New Subset Age is 42.157895"
## [1] "Mean of Age does not match"

Median

#................................Median of BASE AND SUBSET data 
  
  # Building Copy of Base Age Time data Median table
  data.Md.Time.AgeCopy <- data.frame(as.numeric(data.Md.Time.Age$Age),as.integer(data.Md.Time.Age$Time))
  
  names(data.Md.Time.AgeCopy) <- c("AgeC","TimeC")

  
  dataMerge.Md.Table             # Median for Subset data

##   Age Time
## 1  44  210

  data.Md.Time.AgeCopy          # Copy of Base Age Time data Median table

##   AgeC TimeC
## 1   44    12

  # Using MAPPLY to aply function to each item of the vector 
  print("Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match")

## [1] "Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match"

    mapply(function(x, y) {
      # DO the Comapre of the same column value 
      # Replace value to FALSE if Its NA
      value <- as.character(x) == as.character(y) 
      
         replace(value, is.na(value), FALSE)
         }, 
      dataMerge.Md.Table, 
      data.Md.Time.AgeCopy)

##   Age  Time 
##  TRUE FALSE

 # Comapre using direct data frames
    (dataMerge.Md.Table == data.Md.Time.AgeCopy) & !is.na(dataMerge.Md.Table) & !is.na(data.Md.Time.AgeCopy)

##       Age  Time
## [1,] TRUE FALSE

Question 5

For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

require(stringr)
head(dataMerge)

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316             30  15.80000
## 2    2                   1428             36  31.73333
## 3    3                   2485             28  35.50000
## 4    4                    574             33  17.39394
## 5    5                   1830             63  30.50000
## 6    6                    528             62  16.50000

class(dataMerge)

## [1] "data.frame"

dataMerge.New <- dataMerge
#Find all Age in between 0-29, 30-59, 60-Max
head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[0-2][0-9]"),])

##    P ID Time Spent on Medicine Age of Patient Avg. Time
## 3     3                   2485             28  35.50000
## 12   12                    628             24  22.42857
## 13   13                    231             27  11.00000
## 14   14                    120             21  10.00000
## 27   27                    144             24   9.00000
## 38   38                    946             28  22.00000

head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[3-5][0-9]"),])

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316             30  15.80000
## 2    2                   1428             36  31.73333
## 4    4                    574             33  17.39394
## 7    7                   1596             42  28.50000
## 8    8                    152             43   9.50000
## 9    9                   1053             43  40.50000

head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[6-9][0-9]"),])

##    P ID Time Spent on Medicine Age of Patient Avg. Time
## 5     5                   1830             63   30.5000
## 6     6                    528             62   16.5000
## 21   21                    171             60    9.5000
## 30   30                    171             63    9.5000
## 53   53                    -22             62   -0.6875
## 72   72                    -28             60   -0.8000

dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[3-5][0-9]$",replacement = "Over Thirty")

dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[6-9][0-9]$",replacement = "Over Sixty")

dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[0-2][0-9]$",replacement = "Under Thirty")

dataMerge.New $`Age of Patient`  <- str_replace_all(dataMerge.New$`Age of Patient`,pattern = "Under Thirty",replacement ="MSDS")

head(dataMerge.New,200)

##     P ID Time Spent on Medicine Age of Patient  Avg. Time
## 1      1                    316    Over Thirty 15.8000000
## 2      2                   1428    Over Thirty 31.7333333
## 3      3                   2485           MSDS 35.5000000
## 4      4                    574    Over Thirty 17.3939394
## 5      5                   1830     Over Sixty 30.5000000
## 6      6                    528     Over Sixty 16.5000000
## 7      7                   1596    Over Thirty 28.5000000
## 8      8                    152    Over Thirty  9.5000000
## 9      9                   1053    Over Thirty 40.5000000
## 10    10                    888    Over Thirty 11.1000000
## 11    11                    190    Over Thirty 10.0000000
## 12    12                    628           MSDS 22.4285714
## 13    13                    231           MSDS 11.0000000
## 14    14                    120           MSDS 10.0000000
## 15    15                    190    Over Thirty 10.0000000
## 16    16                    231    Over Thirty 11.0000000
## 17    17                    595    Over Thirty 17.5000000
## 18    18                    300    Over Thirty 12.5000000
## 19    19                    253    Over Thirty 11.5000000
## 20    20                    351    Over Thirty 13.5000000
## 21    21                    171     Over Sixty  9.5000000
## 22    22                   4694    Over Thirty 51.5824176
## 23    23                    190    Over Thirty 10.0000000
## 24    24                   3140    Over Thirty 49.8412698
## 25    25                    210    Over Thirty 10.5000000
## 26    26                   -244    Over Thirty -5.4222222
## 27    27                    144           MSDS  9.0000000
## 28    28                    325    Over Thirty 13.0000000
## 29    29                    171    Over Thirty  9.5000000
## 30    30                    171     Over Sixty  9.5000000
## 31    31                    190    Over Thirty 10.0000000
## 32    32                    241    Over Thirty 11.4761905
## 33    33                    250    Over Thirty 14.7058824
## 34    34                   1081    Over Thirty 23.5000000
## 35    35                    171    Over Thirty  9.5000000
## 36    36                    171    Over Thirty  9.5000000
## 37    37                    171    Over Thirty  9.5000000
## 38    38                    946           MSDS 22.0000000
## 39    39                    325    Over Thirty 13.0000000
## 40    40                    372    Over Thirty 12.0000000
## 41    41                     72    Over Thirty  4.5000000
## 42    42                    674           MSDS 25.9230769
## 43    43                    171    Over Thirty  9.5000000
## 44    44                    171    Over Thirty  9.5000000
## 45    45                    190    Over Thirty 10.0000000
## 46    46                    301    Over Thirty 12.5416667
## 47    47                    182    Over Thirty  6.7407407
## 48    48                    277    Over Thirty  7.9142857
## 49    49                   1540    Over Thirty 28.0000000
## 50    50                   1128    Over Thirty 24.0000000
## 51    51                    189    Over Thirty 10.5000000
## 52    52                    263    Over Thirty  7.9696970
## 53    53                    -22     Over Sixty -0.6875000
## 54    54                    175           MSDS 10.9375000
## 55    55                   4719    Over Thirty 39.0000000
## 56    56                    153    Over Thirty  9.0000000
## 57    57                    210    Over Thirty 10.5000000
## 58    58                    831    Over Thirty 16.2941176
## 59    59                    115           MSDS  3.3823529
## 60    60                    142    Over Thirty  7.4736842
## 61    61                    -84    Over Thirty -4.0000000
## 62    62                    351           MSDS 13.5000000
## 63    63                     88    Over Thirty  3.0344828
## 64    64                    231    Over Thirty 10.5000000
## 65    65                   1363    Over Thirty 26.7254902
## 66    66                    217    Over Thirty 10.3333333
## 67    67                   -167    Over Thirty -5.5666667
## 68    68                     71    Over Thirty  6.4545455
## 69    69                    156    Over Thirty  5.3793103
## 70    70                    263           MSDS  6.9210526
## 71    71                     97    Over Thirty  2.8529412
## 72    72                    -28     Over Sixty -0.8000000
## 73    73                    840     Over Sixty 17.8723404
## 74    74                   1330    Over Thirty 17.7333333
## 75    75                    126    Over Thirty  2.0322581
## 76    76                    182    Over Thirty  8.2727273
## 77    77                    210    Over Thirty 10.5000000
## 78    78                   -171    Over Thirty -3.7173913
## 79    79                      0    Over Thirty  0.0000000
## 80    80                    600    Over Thirty 25.0000000
## 81    81                     91    Over Thirty  7.0000000
## 82    82                    119    Over Thirty  3.6060606
## 83    83                     82    Over Thirty  6.8333333
## 84    84                    154    Over Thirty 11.0000000
## 85    85                    777    Over Thirty 14.3888889
## 86    86                    -63    Over Thirty -1.5000000
## 87    87                    153    Over Thirty  8.0526316
## 88    88                    391           MSDS 13.9642857
## 89    89                     39    Over Thirty  1.3448276
## 90    90                    274    Over Thirty  9.1333333
## 91    91                    190     Over Sixty 10.0000000
## 92    92                    210    Over Thirty 10.5000000
## 93    93                     20    Over Thirty  0.5128205
## 94    94                    161    Over Thirty  6.1923077
## 95    95                     -8           MSDS -0.6666667
## 96    96                    -42    Over Thirty -1.5000000
## 97    97                    264    Over Thirty  9.1034483
## 98    98                    456           MSDS  9.7021277
## 99    99                    190    Over Thirty 10.0000000
## 100  100                     85    Over Thirty  2.5757576
## 101  101                    594           MSDS 18.0000000
## 102  102                    158    Over Thirty  9.2941176
## 103  103                    161    Over Thirty  5.9629630
## 104  104                    231    Over Thirty 11.0000000
## 105  105                    125    Over Thirty  8.9285714
## 106  106                    378           MSDS 14.0000000
## 107  107                     70    Over Thirty 10.0000000
## 108  108                   -223    Over Thirty -4.6458333
## 109  109                    150    Over Thirty  6.2500000
## 110  110                   1540    Over Thirty 28.0000000
## 111  111                    465    Over Thirty 13.6764706
## 112  112                    335    Over Thirty  8.8157895
## 113  113                     78    Over Thirty  6.5000000
## 114  114                    189           MSDS  7.8750000
## 115  115                    210    Over Thirty 10.5000000
## 116  116                   3015    Over Thirty 55.8333333
## 117  117                    113    Over Thirty  5.3809524
## 118  118                     88           MSDS  2.6666667
## 119  119                   -133    Over Thirty -6.3333333
## 120  120                    802    Over Thirty 10.6933333
## 121  121                    444    Over Thirty 11.1000000
## 122  122                   -146     Over Sixty -4.8666667
## 123  123                    153           MSDS  9.0000000
## 124  124                     56    Over Thirty  1.7500000
## 125  125                    113    Over Thirty  6.2777778
## 126  126                   2332    Over Thirty 39.5254237
## 127  127                     20    Over Thirty  0.5000000
## 128  128                    595    Over Thirty 17.5000000
## 129  129                    714           MSDS 25.5000000
## 130  130                   1239    Over Thirty 29.5000000
## 131  131                   1153    Over Thirty 32.9428571
## 132  132                    810    Over Thirty 40.5000000
## 133  133                    815           MSDS 35.4347826

Question 6

Display enough rows to see examples of all of steps 1-5 above.

Q1

head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

#...........................................mean  (Time and AGE)
names(data.Mean.Time.Age ) <- c("Time", "Age")
head(data.Mean.Time.Age )

##     Time      Age 
## 15.45568 42.36392

#...........................................Meadion (Time and AGE)
head(data.Md.Time.Age)

## $Time
## [1] 12
## 
## $Age
## [1] 44

Q2

head(dataID.sum.Time)

##   ID Sum.Time
## 1  1      316
## 2  2     1428
## 3  3     2485
## 4  4      574
## 5  5     1830
## 6  6      528

head(dataID.mean.Time)

##   ID Mean Time
## 1  1  15.80000
## 2  2  31.73333
## 3  3  35.50000
## 4  4  17.39394
## 5  5  30.50000
## 6  6  16.50000

head(dataID.age )

##   ID AGE
## 1  1  30
## 2  2  36
## 3  3  28
## 4  4  33
## 5  5  63
## 6  6  62

head(dataCSV)

##   X id time dos hatype age airq medication headache    sex
## 1 1  1  -11 753   Aura  30    9 continuing      yes female
## 2 2  1  -10 754   Aura  30    7 continuing      yes female
## 3 3  1   -9 755   Aura  30   10 continuing      yes female
## 4 4  1   -8 756   Aura  30   13 continuing      yes female
## 5 5  1   -7 757   Aura  30   18 continuing      yes female
## 6 6  1   -6 758   Aura  30   19 continuing      yes female

head(dataCSV.COL.Sub.frame)

##    Patient_ID Age      Staus    Sex
## 21          2  36 continuing female
## 22          2  36 continuing female
## 23          2  36 continuing female
## 24          2  36 continuing female
## 25          2  36 continuing female
## 26          2  36 continuing female

head(dataJoin)

##   Patient ID Total Time Avg Time
## 1          1        316 15.80000
## 2          2       1428 31.73333
## 3          3       2485 35.50000
## 4          4        574 17.39394
## 5          5       1830 30.50000
## 6          6        528 16.50000

head(dataMerge)

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316             30  15.80000
## 2    2                   1428             36  31.73333
## 3    3                   2485             28  35.50000
## 4    4                    574             33  17.39394
## 5    5                   1830             63  30.50000
## 6    6                    528             62  16.50000

#Name
glimpse(NameDataCSV)

## Observations: 4,152
## Variables: 3
## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...

Q3

names(dataMerge) # New name of the columns

## [1] "P ID"                   "Time Spent on Medicine"
## [3] "Age of Patient"         "Avg. Time"

Q4

# Mean of  base and subset table 
data.Mean.Time.Age

##     Time      Age 
## 15.45568 42.36392

dataMerge.Mean.Table

##        Age     Time
## 1 42.15789 482.4962

# Median of  base and subset table 
data.Md.Time.AgeCopy

##   AgeC TimeC
## 1   44    12

dataMerge.Md.Table

##   Age Time
## 1  44  210

Q5

require(tidyverse)
glimpse(dataMerge)

## Observations: 133
## Variables: 4
## $ `P ID`                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
## $ `Age of Patient`         <dbl> 30, 36, 28, 33, 63, 62, 42, 43, 43, 3...
## $ `Avg. Time`              <dbl> 15.80000, 31.73333, 35.50000, 17.3939...

glimpse(dataMerge.New)

## Observations: 133
## Variables: 4
## $ `P ID`                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
## $ `Age of Patient`         <chr> "Over Thirty", "Over Thirty", "MSDS",...
## $ `Avg. Time`              <dbl> 15.80000, 31.73333, 35.50000, 17.3939...

head(dataMerge)

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316             30  15.80000
## 2    2                   1428             36  31.73333
## 3    3                   2485             28  35.50000
## 4    4                    574             33  17.39394
## 5    5                   1830             63  30.50000
## 6    6                    528             62  16.50000

head(dataMerge.New)

##   P ID Time Spent on Medicine Age of Patient Avg. Time
## 1    1                    316    Over Thirty  15.80000
## 2    2                   1428    Over Thirty  31.73333
## 3    3                   2485           MSDS  35.50000
## 4    4                    574    Over Thirty  17.39394
## 5    5                   1830     Over Sixty  30.50000
## 6    6                    528     Over Sixty  16.50000

Question 7

BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

Loading data from https://vincentarelbundock.github.io/Rdatasets/datasets.html

Uploaded to CSV Github , Raw CSV from GIT Hub

gitFile <- "https://github.com/Rajwantmishra/msds/blob/master/Migraine_Headaches.csv" gitRawFile <- "https://raw.githubusercontent.com/Rajwantmishra/msds/master/Migraine_Headaches.csv" #require(XML) read.csv.url <- read.csv( url(gitRawFile)) head(read.csv.url)

## X id time dos hatype age airq medication headache sex ## 1 1 1 -11 753 Aura 30 9 continuing yes female ## 2 2 1 -10 754 Aura 30 7 continuing yes female ## 3 3 1 -9 755 Aura 30 10 continuing yes female ## 4 4 1 -8 756 Aura 30 13 continuing yes female ## 5 5 1 -7 757 Aura 30 18 continuing yes female ## 6 6 1 -6 758 Aura 30 19 continuing yes female

#require(read.table) read.csv.Data <- read.csv(gitRawFile,header=T) head(read.csv.Data )

## X id time dos hatype age airq medication headache sex ## 1 1 1 -11 753 Aura 30 9 continuing yes female ## 2 2 1 -10 754 Aura 30 7 continuing yes female ## 3 3 1 -9 755 Aura 30 10 continuing yes female ## 4 4 1 -8 756 Aura 30 13 continuing yes female ## 5 5 1 -7 757 Aura 30 18 continuing yes female ## 6 6 1 -6 758 Aura 30 19 continuing yes female

#library(data.table) dataTableCSV <- fread(gitRawFile) head(dataTableCSV)

## V1 id time dos hatype age airq medication headache sex ## 1: 1 1 -11 753 Aura 30 9 continuing yes female ## 2: 2 1 -10 754 Aura 30 7 continuing yes female ## 3: 3 1 -9 755 Aura 30 10 continuing yes female ## 4: 4 1 -8 756 Aura 30 13 continuing yes female ## 5: 5 1 -7 757 Aura 30 18 continuing yes female ## 6: 6 1 -6 758 Aura 30 19 continuing yes female

library(tidyverse) tidyDataCSV <- read_csv(gitRawFile)

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification: ## cols( ## X1 = col_double(), ## id = col_double(), ## time = col_double(), ## dos = col_double(), ## hatype = col_character(), ## age = col_double(), ## airq = col_double(), ## medication = col_character(), ## headache = col_character(), ## sex = col_character() ## )

glimpse(tidyDataCSV)

## Observations: 4,152 ## Variables: 10 ## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ... ## $ id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... ## $ time <dbl> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 2... ## $ dos <dbl> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 7... ## $ hatype <chr> "Aura", "Aura", "Aura", "Aura", "Aura", "Aura", "Au... ## $ age <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,... ## $ airq <dbl> 9, 7, 10, 13, 18, 19, 17, 21, 21, 18, 14, 12, 22, 2... ## $ medication <chr> "continuing", "continuing", "continuing", "continui... ## $ headache <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "y... ## $ sex <chr> "female", "female", "female", "female", "female", "...

head(tidyDataCSV)

## # A tibble: 6 x 10 ## X1 id time dos hatype age airq medication headache sex ## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> ## 1 1 1 -11 753 Aura 30 9 continuing yes female ## 2 2 1 -10 754 Aura 30 7 continuing yes female ## 3 3 1 -9 755 Aura 30 10 continuing yes female ## 4 4 1 -8 756 Aura 30 13 continuing yes female ## 5 5 1 -7 757 Aura 30 18 continuing yes female ## 6 6 1 -6 758 Aura 30 19 continuing yes female

MSDS Homework Week2

Rajwant Mishra

January 1, 2019

Data

Loading Packages

Question 1

Read File

Summary of Data

Head & Tail

summary

Dimenssion

Structure

Rows

Columns

Mean

Median

Question 2

Solution 1

Objective

Data

Using Subset & Rename

Solution 2

Solution 3

Solution 4

Question 3

Solution 1

Solution 2

Question 4

Data Check

Summary

Mean

Median

Compare

Mean

Median

Question 5

Question 6

Q1

Q2

Q3

Q4

Q5

Question 7