Data

Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list) Please select one, download it and perform the following tasks:
Packages Used
  • Tidyverse
  • stringr
  • xml
  • data.table

  • Loading Packages

    
    install.packages("tidyverse")
    suppressWarnings(library(tidyverse))
    suppressWarnings(library(stringr))
    suppressWarnings(library(data.table))
    library(plyr)
    

    Question 1

    Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes

    Read File

    # Downloading the file for Location and Name it Migraine_Headaches.csv
    
    getwd() # Working Directory 
    ## [1] "C:/Users/951250/Documents/myR"
    file <- "https://vincentarelbundock.github.io/Rdatasets/csv/carData/KosteckiDillon.csv"
    download.file( url= file, destfile="data/Migraine_Headaches.csv" )
    dataCSVLocal <- read.csv(paste0(getwd(),"/data/Migraine_Headaches.csv"))
    
    head(dataCSVLocal)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    #Open it from Web
    dataCSV <- read.csv(file)
    identical(dataCSVLocal,dataCSV)
    ## [1] TRUE
    class(dataCSV)
    ## [1] "data.frame"

    Summary of Data

    Head & Tail

    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    tail(dataCSV)
    ##         X  id time  dos hatype age airq medication headache  sex
    ## 4147 4147 133   43 1231   Aura  26   19 continuing      yes male
    ## 4148 4148 133   44 1232   Aura  26   20 continuing      yes male
    ## 4149 4149 133   45 1233   Aura  26   27 continuing       no male
    ## 4150 4150 133   46 1234   Aura  26   28 continuing       no male
    ## 4151 4151 133   47 1235   Aura  26   33 continuing       no male
    ## 4152 4152 133   48 1236   Aura  26   25 continuing      yes male

    summary

    summary(dataCSV)
    ##        X              id              time             dos        
    ##  Min.   :   1   Min.   :  1.00   Min.   :-29.00   Min.   :  98.0  
    ##  1st Qu.:1039   1st Qu.: 33.00   1st Qu.:  3.00   1st Qu.: 384.0  
    ##  Median :2076   Median : 67.00   Median : 12.00   Median : 623.0  
    ##  Mean   :2076   Mean   : 66.39   Mean   : 15.46   Mean   : 646.7  
    ##  3rd Qu.:3114   3rd Qu.:100.00   3rd Qu.: 24.00   3rd Qu.: 950.0  
    ##  Max.   :4152   Max.   :133.00   Max.   : 99.00   Max.   :1239.0  
    ##      hatype          age             airq            medication  
    ##  Aura   :1710   Min.   :18.00   Min.   : 3.00   continuing:2386  
    ##  Mixed  : 457   1st Qu.:33.00   1st Qu.:18.00   none      : 785  
    ##  No Aura:1985   Median :44.00   Median :24.00   reduced   : 981  
    ##                 Mean   :42.36   Mean   :24.83                    
    ##                 3rd Qu.:50.00   3rd Qu.:29.00                    
    ##                 Max.   :66.00   Max.   :73.00                    
    ##  headache       sex      
    ##  no :1486   female:3545  
    ##  yes:2666   male  : 607  
    ##                          
    ##                          
    ##                          
    ## 

    Dimenssion

    dim(dataCSV)
    ## [1] 4152   10

    Structure

    str(dataCSV)
    ## 'data.frame':    4152 obs. of  10 variables:
    ##  $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
    ##  $ id        : int  1 1 1 1 1 1 1 1 1 1 ...
    ##  $ time      : int  -11 -10 -9 -8 -7 -6 -5 22 23 24 ...
    ##  $ dos       : int  753 754 755 756 757 758 759 786 787 788 ...
    ##  $ hatype    : Factor w/ 3 levels "Aura","Mixed",..: 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ age       : int  30 30 30 30 30 30 30 30 30 30 ...
    ##  $ airq      : num  9 7 10 13 18 19 17 21 21 18 ...
    ##  $ medication: Factor w/ 3 levels "continuing","none",..: 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ headache  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
    ##  $ sex       : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...

    Rows

    nrow(dataCSV)
    ## [1] 4152

    Columns

    ncol(dataCSV)
    ## [1] 10

    Mean

    #...........................................mean  (Time and AGE)
    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    data.Mean.Time.Age <- sapply( list( dataCSV$time,dataCSV$age ), mean)
    names(data.Mean.Time.Age ) <- c("Time", "Age")
    
    for (n in 1:length(data.Mean.Time.Age))
    {
      print(sprintf(" Mean of %s is %f", names(data.Mean.Time.Age[n]) ,data.Mean.Time.Age[n]))
    }
    ## [1] " Mean of Time is 15.455684"
    ## [1] " Mean of Age is 42.363921"

    Median

    #...........................................Meadion (Time and AGE)
    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    data.Md.Time.Age <- lapply( list( "Time" = dataCSV$time,"Age" = dataCSV$age ), median)
    class(data.Md.Time.Age)
    ## [1] "list"
    length(data.Md.Time.Age)
    ## [1] 2
    data.Md.Time.Age
    ## $Time
    ## [1] 12
    ## 
    ## $Age
    ## [1] 44
    for (n in 1: length(data.Md.Time.Age))
    {
      print( paste("Median for", names(data.Md.Time.Age[n])," - ", data.Md.Time.Age[n]))
    } 
    ## [1] "Median for Time  -  12"
    ## [1] "Median for Age  -  44"

    Question 2

    Create a new data frame with a subset of the columns and rows. Make sure to rename it

    Solution 1

    Objective

    # Get all the Age greater than 30 from the dataset.
    # Create a subset of this data and put it in dataframe 

    Data

    #......................Using subset to create subset of column (Solution 1)
    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female

    Using Subset & Rename

    # Get id,age,dos the data for Age greater than 30 from the dataset.
    dataCSV.COL.Sub.Id_Age_Dos <- subset(dataCSV,  dataCSV$age > 30 , select = c(id,age,dos) )
    
    # Get id,age,medication,sex the data for Age greater than 30 from the dataset.
    dataCSV.COL.Sub.Id_Age_Med_Sex <- subset(dataCSV,  dataCSV$age > 30 , select = c(id,age,medication,sex) )
    
    
    # Creating new data Frame with some column from above subset data dataframe
    dataCSV.COL.Sub.frame <- data.frame(
      dataCSV.COL.Sub.Id_Age_Dos[c("id","age")],
      "Medicince Status" = dataCSV.COL.Sub.Id_Age_Med_Sex$medication,
      "Sex" = dataCSV.COL.Sub.Id_Age_Med_Sex$sex)
    
    head(dataCSV.COL.Sub.Id_Age_Dos)
    ##    id age dos
    ## 21  2  36 128
    ## 22  2  36 129
    ## 23  2  36 130
    ## 24  2  36 131
    ## 25  2  36 132
    ## 26  2  36 133
    head(dataCSV.COL.Sub.Id_Age_Med_Sex)
    ##    id age medication    sex
    ## 21  2  36 continuing female
    ## 22  2  36 continuing female
    ## 23  2  36 continuing female
    ## 24  2  36 continuing female
    ## 25  2  36 continuing female
    ## 26  2  36 continuing female
    head(dataCSV.COL.Sub.frame)
    ##    id age Medicince.Status    Sex
    ## 21  2  36       continuing female
    ## 22  2  36       continuing female
    ## 23  2  36       continuing female
    ## 24  2  36       continuing female
    ## 25  2  36       continuing female
    ## 26  2  36       continuing female
    # Renaming the Columns
    names(dataCSV.COL.Sub.frame) <- c("ID","Age","Medicine","Sex")
    head(dataCSV.COL.Sub.frame)
    ##    ID Age   Medicine    Sex
    ## 21  2  36 continuing female
    ## 22  2  36 continuing female
    ## 23  2  36 continuing female
    ## 24  2  36 continuing female
    ## 25  2  36 continuing female
    ## 26  2  36 continuing female
    #Rename using PLAYR
    require(plyr)
     head(dataCSV.COL.Sub.frame)
    ##    ID Age   Medicine    Sex
    ## 21  2  36 continuing female
    ## 22  2  36 continuing female
    ## 23  2  36 continuing female
    ## 24  2  36 continuing female
    ## 25  2  36 continuing female
    ## 26  2  36 continuing female
     dataCSV.COL.Sub.frame <- rename(dataCSV.COL.Sub.frame,c("ID"="Patient_ID", "Medicine"="Staus"))
     
      head(dataCSV.COL.Sub.frame)
    ##    Patient_ID Age      Staus    Sex
    ## 21          2  36 continuing female
    ## 22          2  36 continuing female
    ## 23          2  36 continuing female
    ## 24          2  36 continuing female
    ## 25          2  36 continuing female
    ## 26          2  36 continuing female

    Solution 2

    Using cbind and changing Names

    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    dataCSV.COL.1 <- dataCSV[c("id","dos")]
    names(dataCSV.COL.1) <- c( "ID" , "Dos")  # naming the Data set 
    dataCSV.COL.2 <- dataCSV$time 
    dataSubset <- cbind( dataCSV.COL.1, "Time" = dataCSV.COL.2)
    class(dataSubset)
    ## [1] "data.frame"
    class(dataSubset) # Data Frame 
    ## [1] "data.frame"
    head(dataSubset)  #Displaying data
    ##   ID Dos Time
    ## 1  1 753  -11
    ## 2  1 754  -10
    ## 3  1 755   -9
    ## 4  1 756   -8
    ## 5  1 757   -7
    ## 6  1 758   -6

    Solution 3

    #...........................................Using Join / Merge / aggregate  (Solution 3)
    
    dataID.sum.Time <- aggregate(list("Sum Time" = dataCSV$time ),by = list("ID" = dataCSV$id ) , sum)
    dataID.mean.Time <- aggregate( time ~ id  , data = dataCSV, mean)
    dataID.age <- aggregate(age ~ id ,  data = dataCSV, mean)
    names(dataID.age) <- c("ID", "AGE")
    names(dataID.mean.Time) <- c("ID", "Mean Time")
    head(dataID.sum.Time)
    ##   ID Sum.Time
    ## 1  1      316
    ## 2  2     1428
    ## 3  3     2485
    ## 4  4      574
    ## 5  5     1830
    ## 6  6      528
    head(dataID.mean.Time)
    ##   ID Mean Time
    ## 1  1  15.80000
    ## 2  2  31.73333
    ## 3  3  35.50000
    ## 4  4  17.39394
    ## 5  5  30.50000
    ## 6  6  16.50000
    head(dataID.age )
    ##   ID AGE
    ## 1  1  30
    ## 2  2  36
    ## 3  3  28
    ## 4  4  33
    ## 5  5  63
    ## 6  6  62
    # merging data of dataID.sum.Time to dataID.age 
    dataMergeAge <- merge(x=dataID.sum.Time, y= dataID.age, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )
    
    
    # merging data  dataMergeAge to dataID.mean.Time show Average time and Total Time taken by Each patient along with their AGE
    dataMerge <- merge(x=dataMergeAge, y= dataID.mean.Time, by.x = c("Patient ID" = "ID") , by.y = c("Patient ID" = "ID") )
    names(dataMerge) <- c("Patient ID","Total Time","Age","Avg Time")
    head(dataMerge)   # Soluiton 
    ##   Patient ID Total Time Age Avg Time
    ## 1          1        316  30 15.80000
    ## 2          2       1428  36 31.73333
    ## 3          3       2485  28 35.50000
    ## 4          4        574  33 17.39394
    ## 5          5       1830  63 30.50000
    ## 6          6        528  62 16.50000
    # Using Join from playr
    require(plyr)
    dataJoin <- join(x = dataID.sum.Time, y = dataID.mean.Time, by = c("ID"))
    names(dataJoin) <- c("Patient ID","Total Time","Avg Time")
    head(dataJoin)
    ##   Patient ID Total Time Avg Time
    ## 1          1        316 15.80000
    ## 2          2       1428 31.73333
    ## 3          3       2485 35.50000
    ## 4          4        574 17.39394
    ## 5          5       1830 30.50000
    ## 6          6        528 16.50000

    Solution 4

    #using setNames 
    glimpse(dataSubset)
    ## Observations: 4,152
    ## Variables: 3
    ## $ ID   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## $ Dos  <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 79...
    ## $ Time <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29,...
    NameDataCSV <- setNames(dataSubset,letters[1:length(dataSubset)])
    glimpse(NameDataCSV)
    ## Observations: 4,152
    ## Variables: 3
    ## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
    ## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...

    Question 3

    Create new column names for the new data frame.

    Solution 1

    Using Plyr package

    head(dataMerge) # using Data from solution 3 of question 2
    ##   Patient ID Total Time Age Avg Time
    ## 1          1        316  30 15.80000
    ## 2          2       1428  36 31.73333
    ## 3          3       2485  28 35.50000
    ## 4          4        574  33 17.39394
    ## 5          5       1830  63 30.50000
    ## 6          6        528  62 16.50000
    names(dataMerge) # Know current name of the columns 
    ## [1] "Patient ID" "Total Time" "Age"        "Avg Time"
    class(dataMerge)  # making sure I am working with Data Frame
    ## [1] "data.frame"
    # Using Plyr package  
    #require(plyr)
    library(plyr)
    names(dataMerge) # Know  name before Change of the columns 
    ## [1] "Patient ID" "Total Time" "Age"        "Avg Time"
    dataMerge <- rename(dataMerge,c("Patient ID" = "P_ID", "Total Time" = "Medicine Time"))
    names(dataMerge) # New name of the columns
    ## [1] "P_ID"          "Medicine Time" "Age"           "Avg Time"

    Solution 2

    Using base R function

    head(dataMerge) # using Data from solution 2 of question 2
    ##   P_ID Medicine Time Age Avg Time
    ## 1    1           316  30 15.80000
    ## 2    2          1428  36 31.73333
    ## 3    3          2485  28 35.50000
    ## 4    4           574  33 17.39394
    ## 5    5          1830  63 30.50000
    ## 6    6           528  62 16.50000
    names(dataMerge) # Know current name of the columns 
    ## [1] "P_ID"          "Medicine Time" "Age"           "Avg Time"
    class(dataMerge)  # making sure I am working with Data Frame
    ## [1] "data.frame"
    # Creating New Name for Columns
    names(dataMerge) = c("P ID","Time Spent on Medicine", "Age of Patient","Avg. Time")
    names(dataMerge) # New name of the columns 
    ## [1] "P ID"                   "Time Spent on Medicine"
    ## [3] "Age of Patient"         "Avg. Time"

    Question 4

    Use the summary function to create an overview of your new data frame. The print the mean and median for the same two attributes. Please compare.

    Data Check

    head(dataMerge)
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316             30  15.80000
    ## 2    2                   1428             36  31.73333
    ## 3    3                   2485             28  35.50000
    ## 4    4                    574             33  17.39394
    ## 5    5                   1830             63  30.50000
    ## 6    6                    528             62  16.50000

    Summary

    #.............................Using Summary
    summary(dataMerge)
    ##       P ID     Time Spent on Medicine Age of Patient    Avg. Time     
    ##  Min.   :  1   Min.   :-244.0         Min.   :18.00   Min.   :-6.333  
    ##  1st Qu.: 34   1st Qu.: 126.0         1st Qu.:33.00   1st Qu.: 6.741  
    ##  Median : 67   Median : 210.0         Median :44.00   Median :10.000  
    ##  Mean   : 67   Mean   : 482.5         Mean   :42.16   Mean   :12.571  
    ##  3rd Qu.:100   3rd Qu.: 574.0         3rd Qu.:50.00   3rd Qu.:14.706  
    ##  Max.   :133   Max.   :4719.0         Max.   :66.00   Max.   :55.833
    str(dataMerge)
    ## 'data.frame':    133 obs. of  4 variables:
    ##  $ P ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
    ##  $ Time Spent on Medicine: int  316 1428 2485 574 1830 528 1596 152 1053 888 ...
    ##  $ Age of Patient        : num  30 36 28 33 63 62 42 43 43 30 ...
    ##  $ Avg. Time             : num  15.8 31.7 35.5 17.4 30.5 ...
    names(dataMerge)
    ## [1] "P ID"                   "Time Spent on Medicine"
    ## [3] "Age of Patient"         "Avg. Time"
    summary(dataMerge,digits = 0,maxsum = 2)  # Disaplying No Decimals and Factors upto 2 if any
    ##       P ID     Time Spent on Medicine Age of Patient   Avg. Time 
    ##  Min.   :  1   Min.   :-244           Min.   :18     Min.   :-6  
    ##  1st Qu.: 34   1st Qu.: 126           1st Qu.:33     1st Qu.: 7  
    ##  Median : 67   Median : 210           Median :44     Median :10  
    ##  Mean   : 67   Mean   : 482           Mean   :42     Mean   :13  
    ##  3rd Qu.:100   3rd Qu.: 574           3rd Qu.:50     3rd Qu.:15  
    ##  Max.   :133   Max.   :4719           Max.   :66     Max.   :56
    #Disaply only 3 Age as factor  
    summary( as.factor(dataMerge$`Age`),maxsum=3)
    ##      46      43 (Other) 
    ##      11       7     115

    Mean

    #Mean
    dataMerge.Mean.Time <- mean(dataMerge$`Time Spent on Medicine`)
    sprintf("Mean of Time %.2f",dataMerge.Mean.Time)
    ## [1] "Mean of Time 482.50"
    dataMerge.Mean.Age <- mean(dataMerge$`Age of Patient`)
    sprintf("Mean of Age  %.2f",dataMerge.Mean.Age)
    ## [1] "Mean of Age  42.16"
    #Hold new Mean Data in data Frame 
    dataMerge.Mean.Table <- data.frame(dataMerge.Mean.Age,dataMerge.Mean.Time) 
    names(dataMerge.Mean.Table) <- c("Age","Time")
    class(dataMerge.Mean.Table)
    ## [1] "data.frame"
    head(dataMerge.Mean.Table)
    ##        Age     Time
    ## 1 42.15789 482.4962

    Median

    #Median
    dataMerge.Md.Time <- median(dataMerge$`Time Spent on Medicine`)
    sprintf("Median of Time  %.0f",dataMerge.Md.Time)
    ## [1] "Median of Time  210"
    dataMerge.Md.Age <- median(dataMerge$`Age of Patient`)
    sprintf("Median of Age %.0f",dataMerge.Md.Age)
    ## [1] "Median of Age 44"
    #Hold new Median Data in data Frame 
    dataMerge.Md.Table <- data.frame(dataMerge.Md.Age,dataMerge.Md.Time) 
    names(dataMerge.Md.Table) <- c("Age","Time")
    class(dataMerge.Md.Table)
    ## [1] "data.frame"
    head(dataMerge.Md.Table)
    ##   Age Time
    ## 1  44  210

    Compare

    Comapre the Two Mean and Median with Result of Question 1, which is stored in data frame “data.Mean.Time.Age” “data.Md.Dos.Age”

    Mean

    #Comapre the Two Mean and Median with Result of Question 1, 
    #which is stored in data frame "data.Mean.Time.Age" "data.Md.Dos.Age "
    
      class(data.Mean.Time.Age)                   # Mean from Base data 
    ## [1] "numeric"
      class(data.Md.Time.Age)                     # Median from Base data 
    ## [1] "list"
      class(dataMerge.Mean.Table)                 # Mean from SUBSET data
    ## [1] "data.frame"
      class(dataMerge.Md.Table)                   # Median from SUBSET data
    ## [1] "data.frame"
      for (n in 1:length(data.Mean.Time.Age))
    { 
        name <- names(data.Mean.Time.Age[n])
        print(sprintf(" Mean of Base %s is %f", name ,data.Mean.Time.Age[n]))
        print(sprintf(" Mean of New Subset %s is %f", names(dataMerge.Mean.Table[name]), dataMerge.Mean.Table[as.character(name)]))
        
        if((identical(data.Mean.Time.Age[n],dataMerge.Mean.Table[n]))==TRUE){
           print(sprintf("Mean of %s does match",name))
          
        } else {
          print(sprintf("Mean of %s does not match",name))
          
        }
      }
    ## [1] " Mean of Base Time is 15.455684"
    ## [1] " Mean of New Subset Time is 482.496241"
    ## [1] "Mean of Time does not match"
    ## [1] " Mean of Base Age is 42.363921"
    ## [1] " Mean of New Subset Age is 42.157895"
    ## [1] "Mean of Age does not match"

    Median

    #................................Median of BASE AND SUBSET data 
      
      # Building Copy of Base Age Time data Median table
      data.Md.Time.AgeCopy <- data.frame(as.numeric(data.Md.Time.Age$Age),as.integer(data.Md.Time.Age$Time))
      
      names(data.Md.Time.AgeCopy) <- c("AgeC","TimeC")
    
      
      dataMerge.Md.Table             # Median for Subset data 
    ##   Age Time
    ## 1  44  210
      data.Md.Time.AgeCopy          # Copy of Base Age Time data Median table
    ##   AgeC TimeC
    ## 1   44    12
      # Using MAPPLY to aply function to each item of the vector 
      print("Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match")
    ## [1] "Medain of the Base and Subset Data Campare result as below, Here True indicates match FALSE indicates no Match"
        mapply(function(x, y) {
          # DO the Comapre of the same column value 
          # Replace value to FALSE if Its NA
          value <- as.character(x) == as.character(y) 
          
             replace(value, is.na(value), FALSE)
             }, 
          dataMerge.Md.Table, 
          data.Md.Time.AgeCopy)
    ##   Age  Time 
    ##  TRUE FALSE
     # Comapre using direct data frames
        (dataMerge.Md.Table == data.Md.Time.AgeCopy) & !is.na(dataMerge.Md.Table) & !is.na(data.Md.Time.AgeCopy) 
    ##       Age  Time
    ## [1,] TRUE FALSE

    Question 5

    For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

    require(stringr)
    head(dataMerge)
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316             30  15.80000
    ## 2    2                   1428             36  31.73333
    ## 3    3                   2485             28  35.50000
    ## 4    4                    574             33  17.39394
    ## 5    5                   1830             63  30.50000
    ## 6    6                    528             62  16.50000
    class(dataMerge)
    ## [1] "data.frame"
    dataMerge.New <- dataMerge
    #Find all Age in between 0-29, 30-59, 60-Max
    head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[0-2][0-9]"),])
    ##    P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 3     3                   2485             28  35.50000
    ## 12   12                    628             24  22.42857
    ## 13   13                    231             27  11.00000
    ## 14   14                    120             21  10.00000
    ## 27   27                    144             24   9.00000
    ## 38   38                    946             28  22.00000
    head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[3-5][0-9]"),])
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316             30  15.80000
    ## 2    2                   1428             36  31.73333
    ## 4    4                    574             33  17.39394
    ## 7    7                   1596             42  28.50000
    ## 8    8                    152             43   9.50000
    ## 9    9                   1053             43  40.50000
    head(dataMerge.New [str_detect(dataMerge.New $`Age of Patient`,pattern = "[6-9][0-9]"),])
    ##    P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 5     5                   1830             63   30.5000
    ## 6     6                    528             62   16.5000
    ## 21   21                    171             60    9.5000
    ## 30   30                    171             63    9.5000
    ## 53   53                    -22             62   -0.6875
    ## 72   72                    -28             60   -0.8000
    dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[3-5][0-9]$",replacement = "Over Thirty")
    
    dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[6-9][0-9]$",replacement = "Over Sixty")
    
    dataMerge.New $`Age of Patient` <- str_replace_all(dataMerge.New $`Age of Patient`,pattern = "^[0-2][0-9]$",replacement = "Under Thirty")
    
    dataMerge.New $`Age of Patient`  <- str_replace_all(dataMerge.New$`Age of Patient`,pattern = "Under Thirty",replacement ="MSDS")
    
    head(dataMerge.New,200)
    ##     P ID Time Spent on Medicine Age of Patient  Avg. Time
    ## 1      1                    316    Over Thirty 15.8000000
    ## 2      2                   1428    Over Thirty 31.7333333
    ## 3      3                   2485           MSDS 35.5000000
    ## 4      4                    574    Over Thirty 17.3939394
    ## 5      5                   1830     Over Sixty 30.5000000
    ## 6      6                    528     Over Sixty 16.5000000
    ## 7      7                   1596    Over Thirty 28.5000000
    ## 8      8                    152    Over Thirty  9.5000000
    ## 9      9                   1053    Over Thirty 40.5000000
    ## 10    10                    888    Over Thirty 11.1000000
    ## 11    11                    190    Over Thirty 10.0000000
    ## 12    12                    628           MSDS 22.4285714
    ## 13    13                    231           MSDS 11.0000000
    ## 14    14                    120           MSDS 10.0000000
    ## 15    15                    190    Over Thirty 10.0000000
    ## 16    16                    231    Over Thirty 11.0000000
    ## 17    17                    595    Over Thirty 17.5000000
    ## 18    18                    300    Over Thirty 12.5000000
    ## 19    19                    253    Over Thirty 11.5000000
    ## 20    20                    351    Over Thirty 13.5000000
    ## 21    21                    171     Over Sixty  9.5000000
    ## 22    22                   4694    Over Thirty 51.5824176
    ## 23    23                    190    Over Thirty 10.0000000
    ## 24    24                   3140    Over Thirty 49.8412698
    ## 25    25                    210    Over Thirty 10.5000000
    ## 26    26                   -244    Over Thirty -5.4222222
    ## 27    27                    144           MSDS  9.0000000
    ## 28    28                    325    Over Thirty 13.0000000
    ## 29    29                    171    Over Thirty  9.5000000
    ## 30    30                    171     Over Sixty  9.5000000
    ## 31    31                    190    Over Thirty 10.0000000
    ## 32    32                    241    Over Thirty 11.4761905
    ## 33    33                    250    Over Thirty 14.7058824
    ## 34    34                   1081    Over Thirty 23.5000000
    ## 35    35                    171    Over Thirty  9.5000000
    ## 36    36                    171    Over Thirty  9.5000000
    ## 37    37                    171    Over Thirty  9.5000000
    ## 38    38                    946           MSDS 22.0000000
    ## 39    39                    325    Over Thirty 13.0000000
    ## 40    40                    372    Over Thirty 12.0000000
    ## 41    41                     72    Over Thirty  4.5000000
    ## 42    42                    674           MSDS 25.9230769
    ## 43    43                    171    Over Thirty  9.5000000
    ## 44    44                    171    Over Thirty  9.5000000
    ## 45    45                    190    Over Thirty 10.0000000
    ## 46    46                    301    Over Thirty 12.5416667
    ## 47    47                    182    Over Thirty  6.7407407
    ## 48    48                    277    Over Thirty  7.9142857
    ## 49    49                   1540    Over Thirty 28.0000000
    ## 50    50                   1128    Over Thirty 24.0000000
    ## 51    51                    189    Over Thirty 10.5000000
    ## 52    52                    263    Over Thirty  7.9696970
    ## 53    53                    -22     Over Sixty -0.6875000
    ## 54    54                    175           MSDS 10.9375000
    ## 55    55                   4719    Over Thirty 39.0000000
    ## 56    56                    153    Over Thirty  9.0000000
    ## 57    57                    210    Over Thirty 10.5000000
    ## 58    58                    831    Over Thirty 16.2941176
    ## 59    59                    115           MSDS  3.3823529
    ## 60    60                    142    Over Thirty  7.4736842
    ## 61    61                    -84    Over Thirty -4.0000000
    ## 62    62                    351           MSDS 13.5000000
    ## 63    63                     88    Over Thirty  3.0344828
    ## 64    64                    231    Over Thirty 10.5000000
    ## 65    65                   1363    Over Thirty 26.7254902
    ## 66    66                    217    Over Thirty 10.3333333
    ## 67    67                   -167    Over Thirty -5.5666667
    ## 68    68                     71    Over Thirty  6.4545455
    ## 69    69                    156    Over Thirty  5.3793103
    ## 70    70                    263           MSDS  6.9210526
    ## 71    71                     97    Over Thirty  2.8529412
    ## 72    72                    -28     Over Sixty -0.8000000
    ## 73    73                    840     Over Sixty 17.8723404
    ## 74    74                   1330    Over Thirty 17.7333333
    ## 75    75                    126    Over Thirty  2.0322581
    ## 76    76                    182    Over Thirty  8.2727273
    ## 77    77                    210    Over Thirty 10.5000000
    ## 78    78                   -171    Over Thirty -3.7173913
    ## 79    79                      0    Over Thirty  0.0000000
    ## 80    80                    600    Over Thirty 25.0000000
    ## 81    81                     91    Over Thirty  7.0000000
    ## 82    82                    119    Over Thirty  3.6060606
    ## 83    83                     82    Over Thirty  6.8333333
    ## 84    84                    154    Over Thirty 11.0000000
    ## 85    85                    777    Over Thirty 14.3888889
    ## 86    86                    -63    Over Thirty -1.5000000
    ## 87    87                    153    Over Thirty  8.0526316
    ## 88    88                    391           MSDS 13.9642857
    ## 89    89                     39    Over Thirty  1.3448276
    ## 90    90                    274    Over Thirty  9.1333333
    ## 91    91                    190     Over Sixty 10.0000000
    ## 92    92                    210    Over Thirty 10.5000000
    ## 93    93                     20    Over Thirty  0.5128205
    ## 94    94                    161    Over Thirty  6.1923077
    ## 95    95                     -8           MSDS -0.6666667
    ## 96    96                    -42    Over Thirty -1.5000000
    ## 97    97                    264    Over Thirty  9.1034483
    ## 98    98                    456           MSDS  9.7021277
    ## 99    99                    190    Over Thirty 10.0000000
    ## 100  100                     85    Over Thirty  2.5757576
    ## 101  101                    594           MSDS 18.0000000
    ## 102  102                    158    Over Thirty  9.2941176
    ## 103  103                    161    Over Thirty  5.9629630
    ## 104  104                    231    Over Thirty 11.0000000
    ## 105  105                    125    Over Thirty  8.9285714
    ## 106  106                    378           MSDS 14.0000000
    ## 107  107                     70    Over Thirty 10.0000000
    ## 108  108                   -223    Over Thirty -4.6458333
    ## 109  109                    150    Over Thirty  6.2500000
    ## 110  110                   1540    Over Thirty 28.0000000
    ## 111  111                    465    Over Thirty 13.6764706
    ## 112  112                    335    Over Thirty  8.8157895
    ## 113  113                     78    Over Thirty  6.5000000
    ## 114  114                    189           MSDS  7.8750000
    ## 115  115                    210    Over Thirty 10.5000000
    ## 116  116                   3015    Over Thirty 55.8333333
    ## 117  117                    113    Over Thirty  5.3809524
    ## 118  118                     88           MSDS  2.6666667
    ## 119  119                   -133    Over Thirty -6.3333333
    ## 120  120                    802    Over Thirty 10.6933333
    ## 121  121                    444    Over Thirty 11.1000000
    ## 122  122                   -146     Over Sixty -4.8666667
    ## 123  123                    153           MSDS  9.0000000
    ## 124  124                     56    Over Thirty  1.7500000
    ## 125  125                    113    Over Thirty  6.2777778
    ## 126  126                   2332    Over Thirty 39.5254237
    ## 127  127                     20    Over Thirty  0.5000000
    ## 128  128                    595    Over Thirty 17.5000000
    ## 129  129                    714           MSDS 25.5000000
    ## 130  130                   1239    Over Thirty 29.5000000
    ## 131  131                   1153    Over Thirty 32.9428571
    ## 132  132                    810    Over Thirty 40.5000000
    ## 133  133                    815           MSDS 35.4347826

    Question 6

    Display enough rows to see examples of all of steps 1-5 above.

    Q1

    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    #...........................................mean  (Time and AGE)
    names(data.Mean.Time.Age ) <- c("Time", "Age")
    head(data.Mean.Time.Age )
    ##     Time      Age 
    ## 15.45568 42.36392
    #...........................................Meadion (Time and AGE)
    head(data.Md.Time.Age)
    ## $Time
    ## [1] 12
    ## 
    ## $Age
    ## [1] 44

    Q2

    head(dataID.sum.Time)
    ##   ID Sum.Time
    ## 1  1      316
    ## 2  2     1428
    ## 3  3     2485
    ## 4  4      574
    ## 5  5     1830
    ## 6  6      528
    head(dataID.mean.Time)
    ##   ID Mean Time
    ## 1  1  15.80000
    ## 2  2  31.73333
    ## 3  3  35.50000
    ## 4  4  17.39394
    ## 5  5  30.50000
    ## 6  6  16.50000
    head(dataID.age )
    ##   ID AGE
    ## 1  1  30
    ## 2  2  36
    ## 3  3  28
    ## 4  4  33
    ## 5  5  63
    ## 6  6  62
    head(dataCSV)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    head(dataCSV.COL.Sub.frame)
    ##    Patient_ID Age      Staus    Sex
    ## 21          2  36 continuing female
    ## 22          2  36 continuing female
    ## 23          2  36 continuing female
    ## 24          2  36 continuing female
    ## 25          2  36 continuing female
    ## 26          2  36 continuing female
    head(dataJoin)
    ##   Patient ID Total Time Avg Time
    ## 1          1        316 15.80000
    ## 2          2       1428 31.73333
    ## 3          3       2485 35.50000
    ## 4          4        574 17.39394
    ## 5          5       1830 30.50000
    ## 6          6        528 16.50000
    head(dataMerge) 
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316             30  15.80000
    ## 2    2                   1428             36  31.73333
    ## 3    3                   2485             28  35.50000
    ## 4    4                    574             33  17.39394
    ## 5    5                   1830             63  30.50000
    ## 6    6                    528             62  16.50000
    #Name
    glimpse(NameDataCSV)
    ## Observations: 4,152
    ## Variables: 3
    ## $ a <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## $ b <int> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 789, 790, ...
    ## $ c <int> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 28, 29, 30...

    Q3

    names(dataMerge) # New name of the columns
    ## [1] "P ID"                   "Time Spent on Medicine"
    ## [3] "Age of Patient"         "Avg. Time"

    Q4

    # Mean of  base and subset table 
    data.Mean.Time.Age
    ##     Time      Age 
    ## 15.45568 42.36392
    dataMerge.Mean.Table
    ##        Age     Time
    ## 1 42.15789 482.4962
    # Median of  base and subset table 
    data.Md.Time.AgeCopy
    ##   AgeC TimeC
    ## 1   44    12
    dataMerge.Md.Table 
    ##   Age Time
    ## 1  44  210

    Q5

    require(tidyverse)
    glimpse(dataMerge)
    ## Observations: 133
    ## Variables: 4
    ## $ `P ID`                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
    ## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
    ## $ `Age of Patient`         <dbl> 30, 36, 28, 33, 63, 62, 42, 43, 43, 3...
    ## $ `Avg. Time`              <dbl> 15.80000, 31.73333, 35.50000, 17.3939...
    glimpse(dataMerge.New)
    ## Observations: 133
    ## Variables: 4
    ## $ `P ID`                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...
    ## $ `Time Spent on Medicine` <int> 316, 1428, 2485, 574, 1830, 528, 1596...
    ## $ `Age of Patient`         <chr> "Over Thirty", "Over Thirty", "MSDS",...
    ## $ `Avg. Time`              <dbl> 15.80000, 31.73333, 35.50000, 17.3939...
    head(dataMerge)
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316             30  15.80000
    ## 2    2                   1428             36  31.73333
    ## 3    3                   2485             28  35.50000
    ## 4    4                    574             33  17.39394
    ## 5    5                   1830             63  30.50000
    ## 6    6                    528             62  16.50000
    head(dataMerge.New)
    ##   P ID Time Spent on Medicine Age of Patient Avg. Time
    ## 1    1                    316    Over Thirty  15.80000
    ## 2    2                   1428    Over Thirty  31.73333
    ## 3    3                   2485           MSDS  35.50000
    ## 4    4                    574    Over Thirty  17.39394
    ## 5    5                   1830     Over Sixty  30.50000
    ## 6    6                    528     Over Sixty  16.50000

    Question 7

    BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

    Loading data from https://vincentarelbundock.github.io/Rdatasets/datasets.html

    Uploaded to CSV Github , Raw CSV from GIT Hub

    gitFile <- "https://github.com/Rajwantmishra/msds/blob/master/Migraine_Headaches.csv"
    gitRawFile <- "https://raw.githubusercontent.com/Rajwantmishra/msds/master/Migraine_Headaches.csv"
    
    
    #require(XML)
    read.csv.url <- read.csv( url(gitRawFile))
    head(read.csv.url)
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    #require(read.table)
    read.csv.Data <- read.csv(gitRawFile,header=T)
    head(read.csv.Data )
    ##   X id time dos hatype age airq medication headache    sex
    ## 1 1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2 2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3 3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4 4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5 5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6 6  1   -6 758   Aura  30   19 continuing      yes female
    #library(data.table)
    dataTableCSV <- fread(gitRawFile)
    head(dataTableCSV)
    ##    V1 id time dos hatype age airq medication headache    sex
    ## 1:  1  1  -11 753   Aura  30    9 continuing      yes female
    ## 2:  2  1  -10 754   Aura  30    7 continuing      yes female
    ## 3:  3  1   -9 755   Aura  30   10 continuing      yes female
    ## 4:  4  1   -8 756   Aura  30   13 continuing      yes female
    ## 5:  5  1   -7 757   Aura  30   18 continuing      yes female
    ## 6:  6  1   -6 758   Aura  30   19 continuing      yes female
    library(tidyverse)
    
    tidyDataCSV <- read_csv(gitRawFile)
    ## Warning: Missing column names filled in: 'X1' [1]
    ## Parsed with column specification:
    ## cols(
    ##   X1 = col_double(),
    ##   id = col_double(),
    ##   time = col_double(),
    ##   dos = col_double(),
    ##   hatype = col_character(),
    ##   age = col_double(),
    ##   airq = col_double(),
    ##   medication = col_character(),
    ##   headache = col_character(),
    ##   sex = col_character()
    ## )
    glimpse(tidyDataCSV)
    ## Observations: 4,152
    ## Variables: 10
    ## $ X1         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
    ## $ id         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
    ## $ time       <dbl> -11, -10, -9, -8, -7, -6, -5, 22, 23, 24, 25, 26, 2...
    ## $ dos        <dbl> 753, 754, 755, 756, 757, 758, 759, 786, 787, 788, 7...
    ## $ hatype     <chr> "Aura", "Aura", "Aura", "Aura", "Aura", "Aura", "Au...
    ## $ age        <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,...
    ## $ airq       <dbl> 9, 7, 10, 13, 18, 19, 17, 21, 21, 18, 14, 12, 22, 2...
    ## $ medication <chr> "continuing", "continuing", "continuing", "continui...
    ## $ headache   <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "y...
    ## $ sex        <chr> "female", "female", "female", "female", "female", "...
    head(tidyDataCSV)
    ## # A tibble: 6 x 10
    ##      X1    id  time   dos hatype   age  airq medication headache sex   
    ##   <dbl> <dbl> <dbl> <dbl> <chr>  <dbl> <dbl> <chr>      <chr>    <chr> 
    ## 1     1     1   -11   753 Aura      30     9 continuing yes      female
    ## 2     2     1   -10   754 Aura      30     7 continuing yes      female
    ## 3     3     1    -9   755 Aura      30    10 continuing yes      female
    ## 4     4     1    -8   756 Aura      30    13 continuing yes      female
    ## 5     5     1    -7   757 Aura      30    18 continuing yes      female
    ## 6     6     1    -6   758 Aura      30    19 continuing yes      female