Data Handling

IMPORTING DATA

URL data

library(RCurl)
covid_raw=getURL("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

covid.csv=read.csv(text=covid_raw)

covid_raw=getURL("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

covid.csv=read.csv(text=covid_raw)

CSV data

1)Easy way read.csv(file.choose(),header = TRUE)

2)Path method read.csv(file="path",header=TRUE)

3)Table method ## Comma separated read.table(file.choose(),header=TRUE,sep=",")

txt data

read.delim(file.choose(),header=TRUE)

2)Table method read.table(file.choose(),header=TRUE,sep="\t")

EXPORTING DATA

Saving in current working directory

write.table(data in environment,file="maho.csv",sep=",")

write.table(data in environment,file="maho.csv",sep=",",row.names=FALSE)

2)Saving in other than working directory

write.table(data in environment,file="path",sep=",",row.names=FALSE)

3)Saving csv file write.csv(data1,file="path/name of the new file",row.names = FALSE)

4)Saving txt file

write.table(data1,file="path/name of the new file",row.names = FALSE, sep=" ")

Insight about the dataframe

Getting the dimensions of the data(rows and col)

dim(mtcars)

## [1] 32 11

To show the first six rows of the data

head(mtcars)

To show the last six rows of the data

tail(mtcars)

Subset the data

mtcars[c(3,4,7),]

Filtering by slicing

mtcars[c(5:9),]

Filtering by slicing excluding

mtcars[-c(5:9),]

Getting variable names in the data

names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

Subset rows using column values

dplyr package

Usage filter(.data, ..., .preserve = FALSE)

Examples:

filter(dataset,columnname=="specific value") filter(pickcount,PickCount=="924")

Subsetting by column name

library(dplyr)

subset(dataframe,select="colname")

Subsetting by 0 and 1 df_name1<-filter(brandpreference,`Primed? 1=Y`=="1") df_name2=filter(brandpreference,`Primed? 1=Y`=="0")

Creating a data frame

dataset$Column[i:j] df_name1=df_name$`Primed? 1=Y`[21:42]

Working with data

Cleaning work space use rm

rm(list=ls())

# rm(name of the object)

To avoid typing $ every time we refer to a variable in a dataset, use attach

mean(mtcars$mpg)

## [1] 20.09062

After using attach, we no longer need to use $

attach(mtcars)

mean(mpg)

## [1] 20.09062

To remove the attach use detach

detach(mtcars)

To see the datatype use class

attach(mtcars)

class(mpg)

## [1] "numeric"

To see the length of the data use length

length(mpg)

## [1] 32

To summarize the data

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

To set a variable as categorical, use as.factor()

x<-c(0,1,0,1,0,0,0,1,0,1)

x<-as.factor(x)

To set a variable as numerical, use as.numeric()

# You can convert TRUE,FALSE,TRUE .... vector to 1,0,0

a=c(TRUE,FALSE,TRUE)

as.numeric(a)

## [1] 1 0 1

Subsetting by categorical variable

mtcars["hp"]

mtcars[,c(1,3)]

mtcars[1:3,1:5]

a=mtcars[mtcars$mpg>21,]
a

Adding a column to the data set use cbind

conditions<- vs==0 & am==1

New_data<-cbind(mtcars,conditions)

New_data[1:5,]

Working Directory

Look at current working directory use getwd()

getwd()

## [1] "/Users/metuhead/Desktop/R"

Set the working directory

setwd("exact path")
setwd("~missing path") setwd("~/Desktop/Midterm1")
projectWD<-"/Users/metuhead/Desktop/FE 541- Applied Stat/Midterm1" setwd(projectWD)
Use the menu "Session/Set Working Directory"

Save workspace image file

Use save.image save.image("nameproject.Rdata")
Use the menu Session/Save work space As
Clear workspace

rm(list=ls())

Loading work space image

load("nameproject.Rdata")

5)Loading the workspace image another way

load(file.choose())

6)Use the menu Session/Load Workspace

Rscript

1)To comment and uncomment all the lines in Rscript

Use the menu Code/Comment Uncomment Lines

2)Use tab key to complete commands

Type me and hit the Tab key; it will show the suggestions, e.g. mean

Installing packages

1)Use install.packages

install.packages("epiR")

2)Then use library()

You must call library() again in every new session

3)See all available packages

https://cran.r-project.org/

4)Menu Tools/Install Packages

Using the Apply Function

Apply functions are a set of loop functions in R

apply(X,MARGIN,FUN,...)

Example:

apply(X=mtcars,MARGIN=2,FUN=mean)

##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

Another way of finding column mean

colMeans(mtcars)

##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500

# just an example
# apply(X=mtcars,MARGIN=2,FUN=plot,type="l")

# plot(apply(X=mtcars,MARGIN=1,FUN=sum))

Removing NA value in apply

apply(X,MARGIN,FUN,na.rm=TRUE)

Using the Tapply Function

tapply can be used to apply a function to subsets of a variable or vector

tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)

Example:

attach(chickwts)
tapply(X=weight,INDEX=feed,FUN=mean)

##    casein horsebean   linseed  meatmeal   soybean sunflower 
##  323.5833  160.2000  218.7500  276.9091  246.4286  328.9167

tapply(hp,vs,mean,simplfy=FALSE)

##         0         1 
## 189.72222  91.35714

Another way

mean(weight[feed=="horsebean"])

## [1] 160.2

tapply(X=hp,INDEX=list(vs,am),FUN=mean,simplify=TRUE)

##          0         1
## 0 194.1667 180.83333
## 1 102.1429  80.57143

Another way

mean(hp[vs==0 & am==0])

## [1] 194.1667