Data Analysis using R

Week 1 - Day 1 material

———————————————————————–

# Numeric data type: a plain number literal is stored as a double.
age <- 24
age

## [1] 24

class(age)

## [1] "numeric"

typeof(age)

## [1] "double"

# Integer data type: the L suffix makes the literal an integer.
age <- 24L
age

## [1] 24

class(age)

## [1] "integer"

typeof(age)

## [1] "integer"

# Logical data type: TRUE / FALSE values.
status <- TRUE
status

## [1] TRUE

class(status)

## [1] "logical"

typeof(status)

## [1] "logical"

# Convert numeric type to integer
# Start from a double so that as.integer() has something to convert;
# applying it to a value that is already an integer would show nothing.
age <- 24
typeof(age)

## [1] "double"

ageInt <- as.integer(age)
ageInt

## [1] 24

class(ageInt)

## [1] "integer"

typeof(ageInt)

## [1] "integer"

# Character data type: anything in quotes is text, even if it looks numeric.
age <- "24.5"
age

## [1] "24.5"

class(age)

## [1] "character"

typeof(age)

## [1] "character"

# Numeric vector: c() combines the values into one double vector.
Age <- c(10, 20, 40.5, 60)
Age

## [1] 10.0 20.0 40.5 60.0

class(Age)

## [1] "numeric"

typeof(Age)

## [1] "double"

# Character vector: c() applied to quoted strings.
state <- c("WB", "Delhi", "Rajasthan")
state

## [1] "WB"        "Delhi"     "Rajasthan"

class(state)

## [1] "character"

typeof(state)

## [1] "character"

# Logical vector: c() applied to TRUE / FALSE values.
status <- c(TRUE, FALSE, FALSE, TRUE)
status

## [1]  TRUE FALSE FALSE  TRUE

class(status)

## [1] "logical"

typeof(status)

## [1] "logical"

# Vector arithmetic: operators work element by element and scalars are recycled.
Age <- c(10, 20, 40.5, 60)
x <- Age / 5
x

## [1]  2.0  4.0  8.1 12.0

y <- 2 * Age + 5
y

## [1]  25  45  86 125

# c() also concatenates existing vectors with further values.
z <- c(x, 3)
z

## [1]  2.0  4.0  8.1 12.0  3.0

z <- c(x, 3, x)
z

## [1]  2.0  4.0  8.1 12.0  3.0  2.0  4.0  8.1 12.0

# Mean worked out by hand, then with the built-in mean().
mean_age <- sum(Age) / length(Age)
mean_age

## [1] 32.625

mean_age <- mean(Age)
mean_age

## [1] 32.625

# Variance with an (n - 1) denominator worked out by hand, then with var().
var_age <- sum((Age - mean_age)^2 / (length(Age) - 1))
var_age

## [1] 494.2292

var_age <- var(Age)
var_age

## [1] 494.2292

# Generating vectors with the colon operator and seq().
x <- 1:30
x

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30

# by = fixes the step between consecutive values.
x <- seq(1, 20, by = 2)
x

##  [1]  1  3  5  7  9 11 13 15 17 19

# length.out = fixes how many equally spaced values are produced.
x <- seq(0, 20, length.out = 5)
x

## [1]  0  5 10 15 20

# rep() repeats the whole Age vector 5 times.
x <- rep(Age, 5)
x

##  [1] 10.0 20.0 40.5 60.0 10.0 20.0 40.5 60.0 10.0 20.0 40.5 60.0 10.0 20.0 40.5
## [16] 60.0 10.0 20.0 40.5 60.0

# A comparison is evaluated element by element and gives a logical vector.
young <- Age <= 18
young

## [1]  TRUE FALSE FALSE FALSE

# rep() works on character vectors too: the pattern is repeated 5 times.
x <- rep(c("A", "B"), 5)
x

##  [1] "A" "B" "A" "B" "A" "B" "A" "B" "A" "B"

# paste0() joins element-wise with no separator; the shorter vector
# c("A", "B") is recycled along 1:5.
x <- paste0(c("A", "B"), 1:5)
x

## [1] "A1" "B2" "A3" "B4" "A5"

# Indexing a vector
Age <- c(47, 61, 72, 43, 55, 53, 50, NA)
# Logical indexing: keep the non-missing entries that are greater than 50.
Age[!is.na(Age) & Age > 50]

## [1] 61 72 55 53

# Overwrite the missing entry with 0.
Age[is.na(Age)] <- 0
Age

## [1] 47 61 72 43 55 53 50  0

# Positive integers select elements by position.
Age[1]

## [1] 47

Age[2:5]

## [1] 61 72 43 55

# Negative integers drop the elements at those positions.
Age[-5]

## [1] 47 61 72 43 53 50  0

Age[-(2:5)]

## [1] 47 53 50  0

# Matrix: matrix() fills column by column unless byrow = TRUE is given.
policies <- matrix(c(4, 5, 6, 9, 3, 12), nrow = 2, ncol = 3)
policies

##      [,1] [,2] [,3]
## [1,]    4    6    3
## [2,]    5    9   12

policies <- matrix(c(4, 5, 6, 9, 3, 12), nrow = 2, ncol = 3, byrow = TRUE)
policies

##      [,1] [,2] [,3]
## [1,]    4    5    6
## [2,]    9    3   12

# [row, column] picks out a single cell.
policies[2, 2]

## [1] 3

# List: named components may have different types and lengths.
lst <- list(
  name = "NSOU",
  Courses = c("UG", "PG", "Phd"),
  no_of_learners = 300000
)
lst

## $name
## [1] "NSOU"
## 
## $Courses
## [1] "UG"  "PG"  "Phd"
## 
## $no_of_learners
## [1] 3e+05

# $ extracts a component by name.
lst$name

## [1] "NSOU"

lst$no_of_learners

## [1] 3e+05

lst$Courses[1]

## [1] "UG"

# [[ ]] extracts a component by position; a further [ ] indexes inside it.
lst[[2]]

## [1] "UG"  "PG"  "Phd"

lst[[2]][2]

## [1] "PG"

# Add names to the rows and columns of the matrix.
# Before any names are assigned, dimnames() returns NULL.
dimnames(policies)

## NULL

policies

##      [,1] [,2] [,3]
## [1,]    4    5    6
## [2,]    9    3   12

# First list element names the rows, second names the columns.
dimnames(policies) <- list(
  c("2013", "2014"),
  c("Medical", "Recreational", "Both")
)
# Factors: categorical variables built from character vectors.
states <- c("st1", "st2", "st3", "st4", "st5", "st6")
stateF <- factor(states)
policies_13_14 <- c("Medical", "Recreational", "Both", "Both", "Both", "Both")
class(policies_13_14)

## [1] "character"

policiesF <- factor(policies_13_14)
policiesF

## [1] Medical      Recreational Both         Both         Both        
## [6] Both        
## Levels: Both Medical Recreational

class(stateF)

## [1] "factor"

levels(policiesF)

## [1] "Both"         "Medical"      "Recreational"

# Count how many states fall under each policy level.
tapply(states, policiesF, length)

##         Both      Medical Recreational 
##            4            1            1

# Data frame: combine equal-length factor and numeric columns into one table.
legalYr <- c("2013", "2014", "2013", "2013", "2014", "2014")
legalYrF <- factor(legalYr)
ounceLim <- c(1, 2, 3.5, 4, 4.5, 6)
marData <- data.frame(stateF, policiesF, legalYrF, ounceLim)
# summary() gives level counts for factors and quartiles/mean for numerics.
summary(marData)

##  stateF         policiesF legalYrF    ounceLim    
##  st1:1   Both        :4   2013:3   Min.   :1.000  
##  st2:1   Medical     :1   2014:3   1st Qu.:2.375  
##  st3:1   Recreational:1            Median :3.750  
##  st4:1                             Mean   :3.500  
##  st5:1                             3rd Qu.:4.375  
##  st6:1                             Max.   :6.000

# Build the full path to the data file.
# No rm(list = ls()) here: wiping the whole workspace is a hidden side effect
# and nothing in this section needs an empty environment.
# NOTE: dataFdr is machine-specific; point it at the folder holding the CSV.
dataFdr <- "D:\\D Drive\\Certificate Course\\data"
filename <- "legal_weed_age_GSS2016_ch1.csv"
# file.path() joins the components with "/", which R accepts on Windows too.
dataFile <- file.path(dataFdr, filename)
dataFile

## [1] "D:\\D Drive\\Certificate Course\\data/legal_weed_age_GSS2016_ch1.csv"

# Read the CSV with readr; both columns come in as character.
library(readr)
data1 <- read_csv(dataFile)

## Rows: 2867 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): grass, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Coerce age from text to numeric.
# NOTE(review): the warning below means some non-missing age entries are not
# plain numbers and are silently turned into NA; confirm that is intended.
data1$age <- as.numeric(data1$age)

## Warning: NAs introduced by coercion

# Treat grass as a categorical variable, then summarise both columns.
data1$grass <- as.factor(data1$grass)
summary(data1)

##        grass           age       
##  DK       : 110   Min.   :18.00  
##  IAP      : 911   1st Qu.:34.00  
##  LEGAL    :1126   Median :49.00  
##  NOT LEGAL: 717   Mean   :48.85  
##  NA's     :   3   3rd Qu.:62.00  
##                   Max.   :88.00  
##                   NA's   :32