Data Analysis using R

Week 1 - Day 1 material

———————————————————————–

# Numeric data type: a plain number literal has class "numeric" (type double)
age <- 24
age
## [1] 24
class(age)
## [1] "numeric"
typeof(age)
## [1] "double"
# Integer data type: the L suffix makes the literal an integer
age <- 24L
age
## [1] 24
class(age)
## [1] "integer"
typeof(age)
## [1] "integer"
# Logical data type: TRUE / FALSE values
status <- TRUE
status
## [1] TRUE
class(status)
## [1] "logical"
typeof(status)
## [1] "logical"
# Convert numeric type to integer
# `age` was last assigned the integer 24L, so it is reset to a double here;
# otherwise as.integer() would receive an integer and convert nothing.
age <- 24
ageInt <- as.integer(age)
ageInt
## [1] 24
class(ageInt)
## [1] "integer"
typeof(ageInt)
## [1] "integer"
# Character data type: a quoted value is text, even if it looks like a number
age <- "24.5"
age
## [1] "24.5"
class(age)
## [1] "character"
typeof(age)
## [1] "character"
# Creating a vector of numeric values with c()
Age <- c(10, 20, 40.5, 60)
Age
## [1] 10.0 20.0 40.5 60.0
class(Age)
## [1] "numeric"
typeof(Age)
## [1] "double"
# Creating a vector of character values with c()
state <- c("WB", "Delhi", "Rajasthan")
state
## [1] "WB"        "Delhi"     "Rajasthan"
class(state)
## [1] "character"
typeof(state)
## [1] "character"
# Creating a vector of logical values with c()
status <- c(TRUE, FALSE, FALSE, TRUE)
status
## [1]  TRUE FALSE FALSE  TRUE
class(status)
## [1] "logical"
typeof(status)
## [1] "logical"
# Vector arithmetic: operators are applied element by element
Age <- c(10, 20, 40.5, 60)
x <- Age / 5
x
## [1]  2.0  4.0  8.1 12.0
y <- 2 * Age + 5
y
## [1]  25  45  86 125
# c() also concatenates existing vectors with new values
z <- c(x, 3)
z
## [1]  2.0  4.0  8.1 12.0  3.0
z <- c(x, 3, x)
z
## [1]  2.0  4.0  8.1 12.0  3.0  2.0  4.0  8.1 12.0
# Mean computed by hand, then with the built-in mean()
mean_age <- sum(Age) / length(Age)
mean_age
## [1] 32.625
mean_age <- mean(Age)
mean_age
## [1] 32.625
# Sample variance (divisor n - 1) computed by hand, then with var()
var_age <- sum((Age - mean_age)^2 / (length(Age) - 1))
var_age
## [1] 494.2292
var_age <- var(Age)
var_age
## [1] 494.2292
# Generating vectors using functions
# The colon operator gives consecutive integers
x <- 1:30
x
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30
# seq() with a fixed step (by) or a fixed number of points (length.out)
x <- seq(1, 20, by = 2)
x
##  [1]  1  3  5  7  9 11 13 15 17 19
x <- seq(0, 20, length.out = 5)
x
## [1]  0  5 10 15 20
# rep() repeats a whole vector; Age is the vector defined earlier in the script
x <- rep(Age, 5)
x
##  [1] 10.0 20.0 40.5 60.0 10.0 20.0 40.5 60.0 10.0 20.0 40.5 60.0 10.0 20.0 40.5
## [16] 60.0 10.0 20.0 40.5 60.0
# A comparison on a vector yields a logical vector
young <- Age <= 18
young
## [1]  TRUE FALSE FALSE FALSE
# rep() works on character vectors too; show the result before x is reused
x <- rep(c("A", "B"), 5)
x
##  [1] "A" "B" "A" "B" "A" "B" "A" "B" "A" "B"
# paste0() joins element-wise with no separator; the shorter vector
# c("A", "B") is recycled to match the five numbers
x <- paste0(c("A", "B"), 1:5)
x
## [1] "A1" "B2" "A3" "B4" "A5"
# Indexing a vector
Age <- c(47, 61, 72, 43, 55, 53, 50, NA)
# Logical indexing: keep the non-missing ages greater than 50
Age[!is.na(Age) & Age > 50]
## [1] 61 72 55 53
# Replace the missing value with 0
Age[is.na(Age)] <- 0
Age
## [1] 47 61 72 43 55 53 50  0
# Indexing by positive integers selects those positions
Age[1]
## [1] 47
Age[2:5]
## [1] 61 72 43 55
# Indexing by negative integers drops those positions
Age[-5]
## [1] 47 61 72 43 53 50  0
Age[-(2:5)]
## [1] 47 53 50  0
# Matrix
# By default the values fill the matrix column by column
policies <- matrix(c(4, 5, 6, 9, 3, 12), nrow = 2, ncol = 3)
policies
##      [,1] [,2] [,3]
## [1,]    4    6    3
## [2,]    5    9   12
# byrow = TRUE fills the matrix row by row instead
policies <- matrix(c(4, 5, 6, 9, 3, 12), nrow = 2, ncol = 3, byrow = TRUE)
policies
##      [,1] [,2] [,3]
## [1,]    4    5    6
## [2,]    9    3   12
# Element in row 2, column 2
policies[2, 2]
## [1] 3
# List: named elements that may differ in type and length
lst <- list(
  name = "NSOU",
  Courses = c("UG", "PG", "Phd"),
  no_of_learners = 300000
)
lst
## $name
## [1] "NSOU"
## 
## $Courses
## [1] "UG"  "PG"  "Phd"
## 
## $no_of_learners
## [1] 3e+05
# Access elements by name with $
lst$name
## [1] "NSOU"
lst$no_of_learners
## [1] 3e+05
lst$Courses[1]
## [1] "UG"
# Access elements by position with [[ ]]
lst[[2]]
## [1] "UG"  "PG"  "Phd"
lst[[2]][2]
## [1] "PG"
# Add names to the rows and columns of the matrix
# The matrix has no dimnames yet
dimnames(policies)
## NULL
policies
##      [,1] [,2] [,3]
## [1,]    4    5    6
## [2,]    9    3   12
# First list element names the rows, second names the columns
dimnames(policies) <- list(
  c("2013", "2014"),
  c("Medical", "Recreational", "Both")
)
# Factors: categorical data stored with a fixed set of levels
states <- c("st1", "st2", "st3", "st4", "st5", "st6")
stateF <- factor(states)
policies_13_14 <- c("Medical", "Recreational", "Both", "Both", "Both", "Both")
class(policies_13_14)
## [1] "character"
policiesF <- factor(policies_13_14)
policiesF
## [1] Medical      Recreational Both         Both         Both        
## [6] Both        
## Levels: Both Medical Recreational
class(stateF)
## [1] "factor"
levels(policiesF)
## [1] "Both"         "Medical"      "Recreational"
# Count the states that fall under each policy level
tapply(states, policiesF, length)
##         Both      Medical Recreational 
##            4            1            1
# Data frame: combine equal-length vectors and factors as columns
legalYr <- c("2013", "2014", "2013", "2013", "2014", "2014")
legalYrF <- factor(legalYr)
ounceLim <- c(1, 2, 3.5, 4, 4.5, 6)
# stateF and policiesF are the factors created in the Factors section
marData <- data.frame(stateF, policiesF, legalYrF, ounceLim)
summary(marData)
##  stateF         policiesF legalYrF    ounceLim    
##  st1:1   Both        :4   2013:3   Min.   :1.000  
##  st2:1   Medical     :1   2014:3   1st Qu.:2.375  
##  st3:1   Recreational:1            Median :3.750  
##  st4:1                             Mean   :3.500  
##  st5:1                             3rd Qu.:4.375  
##  st6:1                             Max.   :6.000
# Remove all objects from the current environment
rm(list = ls())
# Build the full path to the data file (folder + file name)
dataFdr <- "D:\\D Drive\\Certificate Course\\data"
filename <- "legal_weed_age_GSS2016_ch1.csv"
dataFile <- paste(dataFdr, filename, sep = "\\")
dataFile
## [1] "D:\\D Drive\\Certificate Course\\data\\legal_weed_age_GSS2016_ch1.csv"
# Read the CSV with readr; both columns are read as character
library(readr)
data1 <- read_csv(dataFile)
## Rows: 2867 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): grass, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert age to numeric.
# NOTE(review): the warning below means some age entries are not plain numbers
# and are turned into NA (32 NA's in the summary) - inspect those values
# before relying on this conversion.
data1$age <- as.numeric(data1$age)
## Warning: NAs introduced by coercion
# Treat grass as a categorical variable
data1$grass <- as.factor(data1$grass)
summary(data1)
##        grass           age       
##  DK       : 110   Min.   :18.00  
##  IAP      : 911   1st Qu.:34.00  
##  LEGAL    :1126   Median :49.00  
##  NOT LEGAL: 717   Mean   :48.85  
##  NA's     :   3   3rd Qu.:62.00  
##                   Max.   :88.00  
##                   NA's   :32