This post is attempt to list most of the regular commands used in R through a simple project. In this project, we will analyze some fake data in Excel format. To clean the data, we have to remove all ‘NA’ records and duplicated rows, we will also add Date and caculated columns into the data. After we get the tidy data, we will do some basic aggregation, statistic and plotting analysis. If you want to see a pandas version, please refer to https://anaconda.org/dbsnail/analysis_data_with_python/notebook

To know more about how to load a .xlsx file, please refer to this post

Load data

We will use two fake datasets

#convert it into CSv file and load with read.table
health_data = read.table("C:/Users/jli/Documents/MyNotes/Projects/HealthData1.csv",sep = ',', head=TRUE)
head(health_data)
##    Name Height Weight Age Sex
## 1 Karen    162     62  34   F
## 2 Halen    173     73  43   F
## 3  Alen    156     52  45   M
## 4  John    178     62  35   M
## 5   Kim    165     70  45   F
## 6  Jean    167     75  39   F
#convert it into CSv file and load with read.table
health_data2 = read.table("C:/Users/jli/Documents/MyNotes/Projects/HealthData2.csv",sep = ',', na.strings = "NA", head=TRUE)
head(health_data2)
##    Name Height Weight Age VisitDate Sex
## 1 Halen    173     73  44  4/9/2016   F
## 2   Kun    180     90  60  4/1/2016   M
## 3   Tom    168     56  49  4/2/2016   M
## 4           NA     NA  NA              
## 5           NA     NA  NA              
## 6           NA     NA  NA

Explore Data

#show the first 5 rows 
head(health_data, n = 2)
##    Name Height Weight Age Sex
## 1 Karen    162     62  34   F
## 2 Halen    173     73  43   F
#Show all column names
colnames(health_data)
## [1] "Name"   "Height" "Weight" "Age"    "Sex"
#Show the average height
mean(health_data$Height)
## [1] 169.6364
#Show the maximal age
max(health_data$Age)
## [1] 54
#Show Karen's information
subset(health_data, health_data$Name == 'Karen')
##    Name Height Weight Age Sex
## 1 Karen    162     62  34   F
#Update Karen's Weight to 78 kg
health_data[health_data$Name=='Karen',]$Weight <-78

#Show Karen's information again
subset(health_data, health_data$Name == 'Karen')
##    Name Height Weight Age Sex
## 1 Karen    162     78  34   F
#Show the information of patients whose age is above 50
subset(health_data, health_data$Age >50)
##       Name Height Weight Age Sex
## 9  Kenkirk    178     75  54   M
## 10  George    176     72  52   M
#Show the Name and Height of patients whose Height is above 170
subset(health_data, health_data$Height >170, select = c('Name', 'Height'))
##       Name Height
## 2    Halen    173
## 4     John    178
## 9  Kenkirk    178
## 10  George    176
## 11   James    189
#Show Alen and John's information in two ways
health_data[health_data$Name %in% c("Alen", "John") , ]
##   Name Height Weight Age Sex
## 3 Alen    156     52  45   M
## 4 John    178     62  35   M
subset(health_data, health_data$Name %in% c("Alen", "John"))
##   Name Height Weight Age Sex
## 3 Alen    156     52  45   M
## 4 John    178     62  35   M
#Add one 'VisitDate' column
health_data$VisitDate <- as.POSIXct('02/20/2016', format = '%m/%d/%Y')

head(health_data, n = 2)
##    Name Height Weight Age Sex  VisitDate
## 1 Karen    162     78  34   F 2016-02-20
## 2 Halen    173     73  43   F 2016-02-20

Combine data

#remove all records with 'NA'
health_data2 <-health_data2[complete.cases(health_data2),]

#Make the VisitDate as dat format
health_data2$VisitDate = as.POSIXct(health_data2$VisitDate, format = '%m/%d/%Y')

#combine data 
df <- rbind(health_data, health_data2)
dim(df)
## [1] 14  6

Remove duplicated rows

Find duplicate rows by assuming no patient has same name, select the record with the most recent VisitDate if there are any duplicated rows.

#good for short list of Name
table(df$Name)  
## 
##    Alen  George   Halen   James    Jean    John   Karen Kenkirk     Kim 
##       1       1       2       1       1       1       1       1       1 
##    Kirk  Monica             Kun     Tom 
##       1       1       0       1       1
#good for long list of Name
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dup_name <- group_by(df, Name) %>% summarise(count = n() ) %>% filter(count > 1) 

#show no duplicate rows
list_no_dup<- subset(df, !(df$Name %in% dup_name$Name))
dim(list_no_dup)
## [1] 12  6
#return data without duplication
Name_Date<-group_by(df, Name) %>% summarise(VisitDate = max(VisitDate))
df<-merge(df, Name_Date)

#Show the data dimension
dim(df)
## [1] 13  6

Add a calculated column

#Adding a column "BMI" into the dataset
df$BMI <- df$Weight / (df$Height/100)^2

#Show BMI between 25 and 30
subset(df,  df$BMI > 25 & df$BMI <30)
##     Name  VisitDate Height Weight Age Sex      BMI
## 4  James 2016-02-20    189     90  30   M 25.19526
## 5   Jean 2016-02-20    167     75  39   F 26.89232
## 7  Karen 2016-02-20    162     78  34   F 29.72108
## 9    Kim 2016-02-20    165     70  45   F 25.71166
## 10  Kirk 2016-02-20    157     64  35   M 25.96454
## 11   Kun 2016-04-01    180     90  60   M 27.77778

Aggregate and Statistic Analysis

#Show total male and female patients
group_by(df, Sex) %>% summarise(count = n())
## Source: local data frame [2 x 2]
## 
##      Sex count
##   (fctr) (int)
## 1      F     5
## 2      M     8
#Show average BMI and its sd of male and female
group_by(df, Sex) %>% summarise(avg_BMI = mean(BMI), sd_BMI = sd(BMI))
## Source: local data frame [2 x 3]
## 
##      Sex  avg_BMI   sd_BMI
##   (fctr)    (dbl)    (dbl)
## 1      F 25.67748 2.980453
## 2      M 23.32871 2.939875

Plot the data

hist(df$BMI, main='Histogram of BMI', xlab= "BMI")

boxplot(BMI~Sex, data=df, ylab="BMI Weight(kg)/Height(m)^2", xlab="Gender")

plot(df$Age, df$BMI, xlab = "Age", ylab = "Calculated BMI", main ="Patient BMI", col="blue")