data.table
install.packages("tidyverse")
suppressWarnings(library(tidyverse))
suppressWarnings(library(stringr))
suppressWarnings(library(data.table))
library(plyr)
Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes
# Downloading the file for Location and Name it Migraine_Headaches.csv
getwd() # Working Directory
## [1] "C:/Users/951250/Documents/myR"
file <- "https://vincentarelbundock.github.io/Rdatasets/csv/carData/KosteckiDillon.csv"
download.file( url= file, destfile="data/Migraine_Headaches.csv" )
dataCSVLocal <- read.csv(paste0(getwd(),"/data/Migraine_Headaches.csv"))
head(dataCSVLocal)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
#Open it from Web
dataCSV <- read.csv(file)
identical(dataCSVLocal,dataCSV)
## [1] TRUE
class(dataCSV)
## [1] "data.frame"
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
tail(dataCSV)
## X id time dos hatype age airq medication headache sex
## 4147 4147 133 43 1231 Aura 26 19 continuing yes male
## 4148 4148 133 44 1232 Aura 26 20 continuing yes male
## 4149 4149 133 45 1233 Aura 26 27 continuing no male
## 4150 4150 133 46 1234 Aura 26 28 continuing no male
## 4151 4151 133 47 1235 Aura 26 33 continuing no male
## 4152 4152 133 48 1236 Aura 26 25 continuing yes male
summary(dataCSV)
## X id time dos
## Min. : 1 Min. : 1.00 Min. :-29.00 Min. : 98.0
## 1st Qu.:1039 1st Qu.: 33.00 1st Qu.: 3.00 1st Qu.: 384.0
## Median :2076 Median : 67.00 Median : 12.00 Median : 623.0
## Mean :2076 Mean : 66.39 Mean : 15.46 Mean : 646.7
## 3rd Qu.:3114 3rd Qu.:100.00 3rd Qu.: 24.00 3rd Qu.: 950.0
## Max. :4152 Max. :133.00 Max. : 99.00 Max. :1239.0
## hatype age airq medication
## Aura :1710 Min. :18.00 Min. : 3.00 continuing:2386
## Mixed : 457 1st Qu.:33.00 1st Qu.:18.00 none : 785
## No Aura:1985 Median :44.00 Median :24.00 reduced : 981
## Mean :42.36 Mean :24.83
## 3rd Qu.:50.00 3rd Qu.:29.00
## Max. :66.00 Max. :73.00
## headache sex
## no :1486 female:3545
## yes:2666 male : 607
##
##
##
##
dim(dataCSV)
## [1] 4152 10
str(dataCSV)
## 'data.frame': 4152 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int -11 -10 -9 -8 -7 -6 -5 22 23 24 ...
## $ dos : int 753 754 755 756 757 758 759 786 787 788 ...
## $ hatype : Factor w/ 3 levels "Aura","Mixed",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 30 30 30 30 30 30 30 30 30 30 ...
## $ airq : num 9 7 10 13 18 19 17 21 21 18 ...
## $ medication: Factor w/ 3 levels "continuing","none",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ headache : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ sex : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...
nrow(dataCSV)
## [1] 4152
ncol(dataCSV)
## [1] 10
#...........................................mean (Time and AGE)
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
data.Mean.Time.Age <- sapply( list( dataCSV$time,dataCSV$age ), mean)
names(data.Mean.Time.Age ) <- c("Time", "Age")
for (n in 1:length(data.Mean.Time.Age))
{
print(sprintf(" Mean of %s is %f", names(data.Mean.Time.Age[n]) ,data.Mean.Time.Age[n]))
}
## [1] " Mean of Time is 15.455684"
## [1] " Mean of Age is 42.363921"
#...........................................Meadion (Time and AGE)
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
data.Md.Time.Age <- lapply( list( "Time" = dataCSV$time,"Age" = dataCSV$age ), median)
class(data.Md.Time.Age)
## [1] "list"
length(data.Md.Time.Age)
## [1] 2
data.Md.Time.Age
## $Time
## [1] 12
##
## $Age
## [1] 44
for (n in 1: length(data.Md.Time.Age))
{
print( paste("Median for", names(data.Md.Time.Age[n])," - ", data.Md.Time.Age[n]))
}
## [1] "Median for Time - 12"
## [1] "Median for Age - 44"
Create a new data frame with a subset of the columns and rows. Make sure to rename it
# Get all the Age greater than 30 from the dataset.
# Create a subset of this data and put it in dataframe
#......................Using subset to create subset of column (Solution 1)
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
# Get id,age,dos the data for Age greater than 30 from the dataset.
dataCSV.COL.Sub.Id_Age_Dos <- subset(dataCSV, dataCSV$age > 30 , select = c(id,age,dos) )
# Get id,age,medication,sex the data for Age greater than 30 from the dataset.
dataCSV.COL.Sub.Id_Age_Med_Sex <- subset(dataCSV, dataCSV$age > 30 , select = c(id,age,medication,sex) )
# Creating new data Frame with some column from above subset data dataframe
dataCSV.COL.Sub.frame <- data.frame(
dataCSV.COL.Sub.Id_Age_Dos[c("id","age")],
"Medicince Status" = dataCSV.COL.Sub.Id_Age_Med_Sex$medication,
"Sex" = dataCSV.COL.Sub.Id_Age_Med_Sex$sex)
head(dataCSV.COL.Sub.Id_Age_Dos)
## id age dos
## 21 2 36 128
## 22 2 36 129
## 23 2 36 130
## 24 2 36 131
## 25 2 36 132
## 26 2 36 133
head(dataCSV.COL.Sub.Id_Age_Med_Sex)
## id age medication sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
head(dataCSV.COL.Sub.frame)
## id age Medicince.Status Sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
# Renaming the Columns
names(dataCSV.COL.Sub.frame) <- c("ID","Age","Medicine","Sex")
head(dataCSV.COL.Sub.frame)
## ID Age Medicine Sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
#Rename using PLAYR
require(plyr)
head(dataCSV.COL.Sub.frame)
## ID Age Medicine Sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
dataCSV.COL.Sub.frame <- rename(dataCSV.COL.Sub.frame,c("ID"="Patient_ID", "Medicine"="Staus"))
head(dataCSV.COL.Sub.frame)
## Patient_ID Age Staus Sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
Using cbind and changing Names
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
dataCSV.COL.1 <- dataCSV[c("id","dos")]
names(dataCSV.COL.1) <- c( "ID" , "Dos") # naming the Data set
dataCSV.COL.2 <- dataCSV$time
dataSubset <- cbind( dataCSV.COL.1, "Time" = dataCSV.COL.2)
class(dataSubset)
## [1] "data.frame"
class(dataSubset) # Data Frame
## [1] "data.frame"
head(dataSubset) #Displaying data
## ID Dos Time
## 1 1 753 -11
## 2 1 754 -10
## 3 1 755 -9
## 4 1 756 -8
## 5 1 757 -7
## 6 1 758 -6
#...........................................Using Join / Merge / aggregate (Solution 3)
dataID.sum.Time <- aggregate(list("Sum Time" = dataCSV$time ),by = list("ID" = dataCSV$id ) , sum)
dataID.mean.Time <- aggregate( time ~ id , data = dataCSV, mean)
dataID.age <- aggregate(age ~ id , data = dataCSV, mean)
names(dataID.age) <- c("ID", "AGE")
names(dataID.mean.Time) <- c("ID", "Mean Time")
head(dataID.sum.Time)
## ID Sum.Time
## 1 1 316
## 2 2 1428
## 3 3 2485
## 4 4 574
## 5 5 1830
## 6 6 528
head(dataID.mean.Time)
## ID Mean Time
## 1 1 15.80000
## 2 2 31.73333
## 3 3 35.50000
## 4 4 17.39394
## 5 5 30.50000
## 6 6 16.50000
head(dataID.age )
## ID AGE
## 1 1 30
## 2 2 36
## 3 3 28
## 4 4 33
## 5 5 63
## 6 6 62
# merging data of dataID.sum.Time to dataID.age
dataMergeAge <- merge(x=dataID.sum.Time, y= dataID.age, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )
# merging data dataMergeAge to dataID.mean.Time show Average time and Total Time taken by Each patient along with their AGE
dataMerge <- merge(x=dataMergeAge, y= dataID.mean.Time, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )
names(dataMerge) <- c("Patient ID","Total Time","Age","Avg Time")
head(dataMerge) # Soluiton
## Patient ID Total Time Age Avg Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
# Using Join from playr
require(plyr)
dataJoin <- join(x = dataID.sum.Time, y = dataID.mean.Time, by = c("ID"))
names(dataJoin) <- c("Patient ID","Total Time","Avg Time")
head(dataJoin)
## Patient ID Total Time Avg Time
## 1 1 316 15.80000
## 2 2 1428 31.73333
## 3 3 2485 35.50000
## 4 4 574 17.39394
## 5 5 1830 30.50000
## 6 6 528 16.50000
#using setNames
glimpse(dataSubset)
## Observations: 4,152
## Variables: 3
## $ ID <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Dos <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 79...
## $ Time <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29,...
NameDataCSV <- setNames(dataSubset,letters[1:length(dataSubset)])
glimpse(NameDataCSV)
## Observations: 4,152
## Variables: 3
## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...
Create new column names for the new data frame.
Using Plyr package
head(dataMerge) # using Data from solution 3 of question 2
## Patient ID Total Time Age Avg Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
names(dataMerge) # Know current name of the columns
## [1] "Patient ID" "Total Time" "Age" "Avg Time"
class(dataMerge) # making sure I am working with Data Frame
## [1] "data.frame"
# Using Plyr package
#require(plyr)
library(plyr)
names(dataMerge) # Know name before Change of the columns
## [1] "Patient ID" "Total Time" "Age" "Avg Time"
dataMerge <- rename(dataMerge,c("Patient ID" = "P_ID", "Total Time" = "Medicine Time"))
names(dataMerge) # New name of the columns
## [1] "P_ID" "Medicine Time" "Age" "Avg Time"
Using base R function
head(dataMerge) # using Data from solution 2 of question 2
## P_ID Medicine Time Age Avg Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
names(dataMerge) # Know current name of the columns
## [1] "P_ID" "Medicine Time" "Age" "Avg Time"
class(dataMerge) # making sure I am working with Data Frame
## [1] "data.frame"
# Creating New Name for Columns
names(dataMerge) = c("P ID","Time Spent on Medicine", "Age of Patient","Avg. Time")
names(dataMerge) # New name of the columns
## [1] "P ID" "Time Spent on Medicine"
## [3] "Age of Patient" "Avg. Time"
Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.
head(dataMerge)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
#.............................Using Summary
summary(dataMerge)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## Min. : 1 Min. :-244.0 Min. :18.00 Min. :-6.333
## 1st Qu.: 34 1st Qu.: 126.0 1st Qu.:33.00 1st Qu.: 6.741
## Median : 67 Median : 210.0 Median :44.00 Median :10.000
## Mean : 67 Mean : 482.5 Mean :42.16 Mean :12.571
## 3rd Qu.:100 3rd Qu.: 574.0 3rd Qu.:50.00 3rd Qu.:14.706
## Max. :133 Max. :4719.0 Max. :66.00 Max. :55.833
str(dataMerge)
## 'data.frame': 133 obs. of 4 variables:
## $ P ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Time Spent on Medicine: int 316 1428 2485 574 1830 528 1596 152 1053 888 ...
## $ Age of Patient : num 30 36 28 33 63 62 42 43 43 30 ...
## $ Avg. Time : num 15.8 31.7 35.5 17.4 30.5 ...
names(dataMerge)
## [1] "P ID" "Time Spent on Medicine"
## [3] "Age of Patient" "Avg. Time"
summary(dataMerge,digits = 0,maxsum = 2) # Disaplying No Decimals and Factors upto 2 if any
## P ID Time Spent on Medicine Age of Patient Avg. Time
## Min. : 1 Min. :-244 Min. :18 Min. :-6
## 1st Qu.: 34 1st Qu.: 126 1st Qu.:33 1st Qu.: 7
## Median : 67 Median : 210 Median :44 Median :10
## Mean : 67 Mean : 482 Mean :42 Mean :13
## 3rd Qu.:100 3rd Qu.: 574 3rd Qu.:50 3rd Qu.:15
## Max. :133 Max. :4719 Max. :66 Max. :56
#Disaply only 3 Age as factor
summary( as.factor(dataMerge$`Age`),maxsum=3)
## 46 43 (Other)
## 11 7 115
#Mean
dataMerge.Mean.Time <- mean(dataMerge$`Time Spent on Medicine`)
sprintf("Mean of Time %.2f",dataMerge.Mean.Time)
## [1] "Mean of Time 482.50"
dataMerge.Mean.Age <- mean(dataMerge$`Age of Patient`)
sprintf("Mean of Age %.2f",dataMerge.Mean.Age)
## [1] "Mean of Age 42.16"
#Hold new Mean Data in data Frame
dataMerge.Mean.Table <- data.frame(dataMerge.Mean.Age,dataMerge.Mean.Time)
names(dataMerge.Mean.Table) <- c("Age","Time")
class(dataMerge.Mean.Table)
## [1] "data.frame"
head(dataMerge.Mean.Table)
## Age Time
## 1 42.15789 482.4962
#Median
dataMerge.Md.Time <- median(dataMerge$`Time Spent on Medicine`)
sprintf("Median of Time %.0f",dataMerge.Md.Time)
## [1] "Median of Time 210"
dataMerge.Md.Age <- median(dataMerge$`Age of Patient`)
sprintf("Median of Age %.0f",dataMerge.Md.Age)
## [1] "Median of Age 44"
#Hold new Median Data in data Frame
dataMerge.Md.Table <- data.frame(dataMerge.Md.Age,dataMerge.Md.Time)
names(dataMerge.Md.Table) <- c("Age","Time")
class(dataMerge.Md.Table)
## [1] "data.frame"
head(dataMerge.Md.Table)
## Age Time
## 1 44 210
Comapre the Two Mean and Median with Result of Question 1, which is stored in data frame “data.Mean.Time.Age” “data.Md.Dos.Age”
#Comapre the Two Mean and Median with Result of Question 1,
#which is stored in data frame "data.Mean.Time.Age" "data.Md.Dos.Age "
class(data.Mean.Time.Age) # Mean from Base data
## [1] "numeric"
class(data.Md.Time.Age) # Median from Base data
## [1] "list"
class(dataMerge.Mean.Table) # Mean from SUBSET data
## [1] "data.frame"
class(dataMerge.Md.Table) # Median from SUBSET data
## [1] "data.frame"
for (n in 1:length(data.Mean.Time.Age))
{
name <- names(data.Mean.Time.Age[n])
print(sprintf(" Mean of Base %s is %f", name ,data.Mean.Time.Age[n]))
print(sprintf(" Mean of New Subset %s is %f", names(dataMerge.Mean.Table[name]), dataMerge.Mean.Table[as.character(name)]))
if((identical(data.Mean.Time.Age[n],dataMerge.Mean.Table[n]))==TRUE){
print(sprintf("Mean of %s does match",name))
} else {
print(sprintf("Mean of %s does not match",name))
}
}
## [1] " Mean of Base Time is 15.455684"
## [1] " Mean of New Subset Time is 482.496241"
## [1] "Mean of Time does not match"
## [1] " Mean of Base Age is 42.363921"
## [1] " Mean of New Subset Age is 42.157895"
## [1] "Mean of Age does not match"
#................................Median of BASE AND SUBSET data
# Building Copy of Base Age Time data Median table
data.Md.Time.AgeCopy <- data.frame(as.numeric(data.Md.Time.Age$Age),as.integer(data.Md.Time.Age$Time))
names(data.Md.Time.AgeCopy) <- c("AgeC","TimeC")
dataMerge.Md.Table # Median for Subset data
## Age Time
## 1 44 210
data.Md.Time.AgeCopy # Copy of Base Age Time data Median table
## AgeC TimeC
## 1 44 12
# Using MAPPLY to aply function to each item of the vector
print("Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match")
## [1] "Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match"
mapply(function(x, y) {
# DO the Comapre of the same column value
# Replace value to FALSE if Its NA
value <- as.character(x) == as.character(y)
replace(value, is.na(value), FALSE)
},
dataMerge.Md.Table,
data.Md.Time.AgeCopy)
## Age Time
## TRUE FALSE
# Comapre using direct data frames
(dataMerge.Md.Table == data.Md.Time.AgeCopy) & !is.na(dataMerge.Md.Table) & !is.na(data.Md.Time.AgeCopy)
## Age Time
## [1,] TRUE FALSE
For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.
require(stringr)
head(dataMerge)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
class(dataMerge)
## [1] "data.frame"
dataMerge.New <- dataMerge
#Find all Age in between 0-29, 30-59, 60-Max
head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[0-2][0-9]"),])
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 3 3 2485 28 35.50000
## 12 12 628 24 22.42857
## 13 13 231 27 11.00000
## 14 14 120 21 10.00000
## 27 27 144 24 9.00000
## 38 38 946 28 22.00000
head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[3-5][0-9]"),])
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 4 4 574 33 17.39394
## 7 7 1596 42 28.50000
## 8 8 152 43 9.50000
## 9 9 1053 43 40.50000
head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[6-9][0-9]"),])
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 5 5 1830 63 30.5000
## 6 6 528 62 16.5000
## 21 21 171 60 9.5000
## 30 30 171 63 9.5000
## 53 53 -22 62 -0.6875
## 72 72 -28 60 -0.8000
dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[3-5][0-9]$",replacement = "Over Thirty")
dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[6-9][0-9]$",replacement = "Over Sixty")
dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[0-2][0-9]$",replacement = "Under Thirty")
dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New$`Age of Patient`,pattern = "Under Thirty",replacement ="MSDS")
head(dataMerge.New,200)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 Over Thirty 15.8000000
## 2 2 1428 Over Thirty 31.7333333
## 3 3 2485 MSDS 35.5000000
## 4 4 574 Over Thirty 17.3939394
## 5 5 1830 Over Sixty 30.5000000
## 6 6 528 Over Sixty 16.5000000
## 7 7 1596 Over Thirty 28.5000000
## 8 8 152 Over Thirty 9.5000000
## 9 9 1053 Over Thirty 40.5000000
## 10 10 888 Over Thirty 11.1000000
## 11 11 190 Over Thirty 10.0000000
## 12 12 628 MSDS 22.4285714
## 13 13 231 MSDS 11.0000000
## 14 14 120 MSDS 10.0000000
## 15 15 190 Over Thirty 10.0000000
## 16 16 231 Over Thirty 11.0000000
## 17 17 595 Over Thirty 17.5000000
## 18 18 300 Over Thirty 12.5000000
## 19 19 253 Over Thirty 11.5000000
## 20 20 351 Over Thirty 13.5000000
## 21 21 171 Over Sixty 9.5000000
## 22 22 4694 Over Thirty 51.5824176
## 23 23 190 Over Thirty 10.0000000
## 24 24 3140 Over Thirty 49.8412698
## 25 25 210 Over Thirty 10.5000000
## 26 26 -244 Over Thirty -5.4222222
## 27 27 144 MSDS 9.0000000
## 28 28 325 Over Thirty 13.0000000
## 29 29 171 Over Thirty 9.5000000
## 30 30 171 Over Sixty 9.5000000
## 31 31 190 Over Thirty 10.0000000
## 32 32 241 Over Thirty 11.4761905
## 33 33 250 Over Thirty 14.7058824
## 34 34 1081 Over Thirty 23.5000000
## 35 35 171 Over Thirty 9.5000000
## 36 36 171 Over Thirty 9.5000000
## 37 37 171 Over Thirty 9.5000000
## 38 38 946 MSDS 22.0000000
## 39 39 325 Over Thirty 13.0000000
## 40 40 372 Over Thirty 12.0000000
## 41 41 72 Over Thirty 4.5000000
## 42 42 674 MSDS 25.9230769
## 43 43 171 Over Thirty 9.5000000
## 44 44 171 Over Thirty 9.5000000
## 45 45 190 Over Thirty 10.0000000
## 46 46 301 Over Thirty 12.5416667
## 47 47 182 Over Thirty 6.7407407
## 48 48 277 Over Thirty 7.9142857
## 49 49 1540 Over Thirty 28.0000000
## 50 50 1128 Over Thirty 24.0000000
## 51 51 189 Over Thirty 10.5000000
## 52 52 263 Over Thirty 7.9696970
## 53 53 -22 Over Sixty -0.6875000
## 54 54 175 MSDS 10.9375000
## 55 55 4719 Over Thirty 39.0000000
## 56 56 153 Over Thirty 9.0000000
## 57 57 210 Over Thirty 10.5000000
## 58 58 831 Over Thirty 16.2941176
## 59 59 115 MSDS 3.3823529
## 60 60 142 Over Thirty 7.4736842
## 61 61 -84 Over Thirty -4.0000000
## 62 62 351 MSDS 13.5000000
## 63 63 88 Over Thirty 3.0344828
## 64 64 231 Over Thirty 10.5000000
## 65 65 1363 Over Thirty 26.7254902
## 66 66 217 Over Thirty 10.3333333
## 67 67 -167 Over Thirty -5.5666667
## 68 68 71 Over Thirty 6.4545455
## 69 69 156 Over Thirty 5.3793103
## 70 70 263 MSDS 6.9210526
## 71 71 97 Over Thirty 2.8529412
## 72 72 -28 Over Sixty -0.8000000
## 73 73 840 Over Sixty 17.8723404
## 74 74 1330 Over Thirty 17.7333333
## 75 75 126 Over Thirty 2.0322581
## 76 76 182 Over Thirty 8.2727273
## 77 77 210 Over Thirty 10.5000000
## 78 78 -171 Over Thirty -3.7173913
## 79 79 0 Over Thirty 0.0000000
## 80 80 600 Over Thirty 25.0000000
## 81 81 91 Over Thirty 7.0000000
## 82 82 119 Over Thirty 3.6060606
## 83 83 82 Over Thirty 6.8333333
## 84 84 154 Over Thirty 11.0000000
## 85 85 777 Over Thirty 14.3888889
## 86 86 -63 Over Thirty -1.5000000
## 87 87 153 Over Thirty 8.0526316
## 88 88 391 MSDS 13.9642857
## 89 89 39 Over Thirty 1.3448276
## 90 90 274 Over Thirty 9.1333333
## 91 91 190 Over Sixty 10.0000000
## 92 92 210 Over Thirty 10.5000000
## 93 93 20 Over Thirty 0.5128205
## 94 94 161 Over Thirty 6.1923077
## 95 95 -8 MSDS -0.6666667
## 96 96 -42 Over Thirty -1.5000000
## 97 97 264 Over Thirty 9.1034483
## 98 98 456 MSDS 9.7021277
## 99 99 190 Over Thirty 10.0000000
## 100 100 85 Over Thirty 2.5757576
## 101 101 594 MSDS 18.0000000
## 102 102 158 Over Thirty 9.2941176
## 103 103 161 Over Thirty 5.9629630
## 104 104 231 Over Thirty 11.0000000
## 105 105 125 Over Thirty 8.9285714
## 106 106 378 MSDS 14.0000000
## 107 107 70 Over Thirty 10.0000000
## 108 108 -223 Over Thirty -4.6458333
## 109 109 150 Over Thirty 6.2500000
## 110 110 1540 Over Thirty 28.0000000
## 111 111 465 Over Thirty 13.6764706
## 112 112 335 Over Thirty 8.8157895
## 113 113 78 Over Thirty 6.5000000
## 114 114 189 MSDS 7.8750000
## 115 115 210 Over Thirty 10.5000000
## 116 116 3015 Over Thirty 55.8333333
## 117 117 113 Over Thirty 5.3809524
## 118 118 88 MSDS 2.6666667
## 119 119 -133 Over Thirty -6.3333333
## 120 120 802 Over Thirty 10.6933333
## 121 121 444 Over Thirty 11.1000000
## 122 122 -146 Over Sixty -4.8666667
## 123 123 153 MSDS 9.0000000
## 124 124 56 Over Thirty 1.7500000
## 125 125 113 Over Thirty 6.2777778
## 126 126 2332 Over Thirty 39.5254237
## 127 127 20 Over Thirty 0.5000000
## 128 128 595 Over Thirty 17.5000000
## 129 129 714 MSDS 25.5000000
## 130 130 1239 Over Thirty 29.5000000
## 131 131 1153 Over Thirty 32.9428571
## 132 132 810 Over Thirty 40.5000000
## 133 133 815 MSDS 35.4347826
Display enough rows to see examples of all of steps 1-5 above.
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
#...........................................mean (Time and AGE)
names(data.Mean.Time.Age ) <- c("Time", "Age")
head(data.Mean.Time.Age )
## Time Age
## 15.45568 42.36392
#...........................................Meadion (Time and AGE)
head(data.Md.Time.Age)
## $Time
## [1] 12
##
## $Age
## [1] 44
head(dataID.sum.Time)
## ID Sum.Time
## 1 1 316
## 2 2 1428
## 3 3 2485
## 4 4 574
## 5 5 1830
## 6 6 528
head(dataID.mean.Time)
## ID Mean Time
## 1 1 15.80000
## 2 2 31.73333
## 3 3 35.50000
## 4 4 17.39394
## 5 5 30.50000
## 6 6 16.50000
head(dataID.age )
## ID AGE
## 1 1 30
## 2 2 36
## 3 3 28
## 4 4 33
## 5 5 63
## 6 6 62
head(dataCSV)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
head(dataCSV.COL.Sub.frame)
## Patient_ID Age Staus Sex
## 21 2 36 continuing female
## 22 2 36 continuing female
## 23 2 36 continuing female
## 24 2 36 continuing female
## 25 2 36 continuing female
## 26 2 36 continuing female
head(dataJoin)
## Patient ID Total Time Avg Time
## 1 1 316 15.80000
## 2 2 1428 31.73333
## 3 3 2485 35.50000
## 4 4 574 17.39394
## 5 5 1830 30.50000
## 6 6 528 16.50000
head(dataMerge)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
#Name
glimpse(NameDataCSV)
## Observations: 4,152
## Variables: 3
## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...
names(dataMerge) # New name of the columns
## [1] "P ID" "Time Spent on Medicine"
## [3] "Age of Patient" "Avg. Time"
# Mean of base and subset table
data.Mean.Time.Age
## Time Age
## 15.45568 42.36392
dataMerge.Mean.Table
## Age Time
## 1 42.15789 482.4962
# Median of base and subset table
data.Md.Time.AgeCopy
## AgeC TimeC
## 1 44 12
dataMerge.Md.Table
## Age Time
## 1 44 210
require(tidyverse)
glimpse(dataMerge)
## Observations: 133
## Variables: 4
## $ `P ID` <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
## $ `Age of Patient` <dbl> 30, 36, 28, 33, 63, 62, 42, 43, 43, 3...
## $ `Avg. Time` <dbl> 15.80000, 31.73333, 35.50000, 17.3939...
glimpse(dataMerge.New)
## Observations: 133
## Variables: 4
## $ `P ID` <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
## $ `Age of Patient` <chr> "Over Thirty", "Over Thirty", "MSDS",...
## $ `Avg. Time` <dbl> 15.80000, 31.73333, 35.50000, 17.3939...
head(dataMerge)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 30 15.80000
## 2 2 1428 36 31.73333
## 3 3 2485 28 35.50000
## 4 4 574 33 17.39394
## 5 5 1830 63 30.50000
## 6 6 528 62 16.50000
head(dataMerge.New)
## P ID Time Spent on Medicine Age of Patient Avg. Time
## 1 1 316 Over Thirty 15.80000
## 2 2 1428 Over Thirty 31.73333
## 3 3 2485 MSDS 35.50000
## 4 4 574 Over Thirty 17.39394
## 5 5 1830 Over Sixty 30.50000
## 6 6 528 Over Sixty 16.50000
BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.
Loading data from https://vincentarelbundock.github.io/Rdatasets/datasets.html
Uploaded to CSV Github , Raw CSV from GIT Hub
gitFile <- "https://github.com/Rajwantmishra/msds/blob/master/Migraine_Headaches.csv"
gitRawFile <- "https://raw.githubusercontent.com/Rajwantmishra/msds/master/Migraine_Headaches.csv"
#require(XML)
read.csv.url <- read.csv( url(gitRawFile))
head(read.csv.url)
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
#require(read.table)
read.csv.Data <- read.csv(gitRawFile,header=T)
head(read.csv.Data )
## X id time dos hatype age airq medication headache sex
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female
#library(data.table)
dataTableCSV <- fread(gitRawFile)
head(dataTableCSV)
## V1 id time dos hatype age airq medication headache sex
## 1: 1 1 -11 753 Aura 30 9 continuing yes female
## 2: 2 1 -10 754 Aura 30 7 continuing yes female
## 3: 3 1 -9 755 Aura 30 10 continuing yes female
## 4: 4 1 -8 756 Aura 30 13 continuing yes female
## 5: 5 1 -7 757 Aura 30 18 continuing yes female
## 6: 6 1 -6 758 Aura 30 19 continuing yes female
library(tidyverse)
tidyDataCSV <- read_csv(gitRawFile)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## id = col_double(),
## time = col_double(),
## dos = col_double(),
## hatype = col_character(),
## age = col_double(),
## airq = col_double(),
## medication = col_character(),
## headache = col_character(),
## sex = col_character()
## )
glimpse(tidyDataCSV)
## Observations: 4,152
## Variables: 10
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ time <dbl> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 2...
## $ dos <dbl> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 7...
## $ hatype <chr> "Aura", "Aura", "Aura", "Aura", "Aura", "Aura", "Au...
## $ age <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,...
## $ airq <dbl> 9, 7, 10, 13, 18, 19, 17, 21, 21, 18, 14, 12, 22, 2...
## $ medication <chr> "continuing", "continuing", "continuing", "continui...
## $ headache <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "y...
## $ sex <chr> "female", "female", "female", "female", "female", "...
head(tidyDataCSV)
## # A tibble: 6 x 10
## X1 id time dos hatype age airq medication headache sex
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 1 -11 753 Aura 30 9 continuing yes female
## 2 2 1 -10 754 Aura 30 7 continuing yes female
## 3 3 1 -9 755 Aura 30 10 continuing yes female
## 4 4 1 -8 756 Aura 30 13 continuing yes female
## 5 5 1 -7 757 Aura 30 18 continuing yes female
## 6 6 1 -6 758 Aura 30 19 continuing yes female