#vector

# create: two numeric vectors of six values each
score <- c(100, 90, 60, 70, 85, 90)
age <- c(30, 35, 60, 50, 32, 25)

# select: pick single elements by 1-based position
score[3]
## [1] 60
age[5]
## [1] 32
# a position past the end of the vector gives NA instead of an error
age[10]
## [1] NA
# both vectors hold the same number of elements
length(score)
## [1] 6
length(age)
## [1] 6

### Append element

# add one value to the end of each vector
age <- append(age, 39)
age
## [1] 30 35 60 50 32 25 39
score <- append(score, 78)
score
## [1] 100  90  60  70  85  90  78
# prepend: list the new value first when combining
age <- c(40, age)
age
## [1] 40 30 35 60 50 32 25 39
# insert in the middle: look at the two halves ...
age[1:4]
## [1] 40 30 35 60
age[5:8]
## [1] 50 32 25 39
# ... then rebuild the vector with 45 placed between them
age <- c(age[1:4], 45, age[5:8])
age
## [1] 40 30 35 60 45 50 32 25 39

### Select a vector segment

# a range of positions returns that slice of the vector
age[3:5]
## [1] 35 60 45
# positions beyond the last element (8 to 10 here) come back as NA
score[4:10]
## [1] 70 85 90 78 NA NA NA

### Remove element

# a negative index drops that position (here the 8th value, 25)
age <- age[-8]
age
## [1] 40 30 35 60 45 50 32 39

### Update

score
## [1] 100  90  60  70  85  90  78
# overwrite the 4th score by assigning to its position
score[4] <- 80
score
## [1] 100  90  60  80  85  90  78

# conditional selection: index a vector with a logical condition

age_less_50 <- age[age < 50]
age_less_50
## [1] 40 30 35 45 32 39
age_less_equal_50 <- age[age <= 50]
age_less_equal_50
## [1] 40 30 35 45 50 32 39
age_more_equal_50 <- age[age >= 50]
age_more_equal_50
## [1] 60 50
# 40 <= age <= 50, first in two steps: keep ages >= 40 ...
age_greater_40 <- age[age >= 40]
age_greater_40
## [1] 40 60 45 50
# ... then keep those that are also <= 50
age_40_50 <- age_greater_40[age_greater_40 <= 50]
age_40_50
## [1] 40 45 50
# the same selection in one step, combining both conditions with &
age_40_50_v2 <- age[age >= 40 & age <= 50]
age_40_50_v2
## [1] 40 45 50

### condition: a comparison on a vector yields one TRUE/FALSE per element

result <- score >= 80
result
## [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE

### Simple statistics

# Descriptive statistics of the seven values currently held in `score`.
# smallest and largest value
min(score)
## [1] 60
max(score)
## [1] 100
# arithmetic mean (sum divided by the number of values: 583 / 7)
mean(score)
## [1] 83.28571
sum(score)
## [1] 583
# middle value of the sorted scores
median(score)
## [1] 85
# spread: standard deviation and variance (sd is the square root of var)
sd(score)
## [1] 12.60574
var(score)
## [1] 158.9048
# called without further arguments, quantile() reports the 0/25/50/75/100% points
quantile(score)
##   0%  25%  50%  75% 100% 
##   60   79   85   90  100

### Correlation

# extend both vectors to 11 values so they pair up element by element
score <- c(score, 100, 80, 20, 20)
score
##  [1] 100  90  60  80  85  90  78 100  80  20  20
age <- c(age, 30, 70, 75)
age
##  [1] 40 30 35 60 45 50 32 39 30 70 75
score
##  [1] 100  90  60  80  85  90  78 100  80  20  20
age
##  [1] 40 30 35 60 45 50 32 39 30 70 75
# the coefficient is the same whichever vector is passed first
cor(score, age)
## [1] -0.7311604
cor(age, score)
## [1] -0.7311604

# Data Frame

# five rows; every column is a vector of the same length
data <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  name = c("A", "B", "S", "D", "P"),
  score = c(100, 80, 20, 20, 30),
  age = c(30, 35, 60, 50, 60)
)

data

# Select column: `$` pulls a single column out as a vector

data$score
## [1] 100  80  20  20  30
data$age
## [1] 30 35 60 50 60

### Selection: data[row, column]

# second row, all columns
data[2, ]
# second column, all rows (comes back as a plain vector)
data[, 2]
## [1] "A" "B" "S" "D" "P"
# a single cell: row 2, column 2
data[2, 2]
## [1] "B"
# columns 3 and 4
data[, 3:4]
# columns 1, 3 and 4
data[, c(1, 3, 4)]
# rows 2 and 4 of those same columns
data[c(2, 4), c(1, 3, 4)]
# every row whose age is at most 50
data[data$age <= 50, ]

### Select students with a score greater than or equal to 40

# rows with score >= 40, showing the first three columns
data[data$score >= 40, 1:3]

### Select students with score >= 40 and age >= 30; show only name and score

# both conditions must hold for a row; name and score are columns 2 and 3
data[data$score >= 40 & data$age >= 30, c("name", "score")]

### Ordering dataset

# order() returns the row positions in sorted order; index the rows with it
# ascending by age
data[order(data$age), ]
# descending by score
data[order(data$score, decreasing = TRUE), ]

### Add a new column

dept <- c("CS", "Bio", "Genetices", "Phy", "Sports")
# cbind() attaches dept as an extra column; use rbind() to add a new row
data <- cbind(data, dept)
data

### Summary

# Per-column overview: min/quartiles/median/mean/max for the numeric columns,
# and length/class/mode for the character columns (name, dept).
summary(data)
##        ID        name               score          age         dept          
##  Min.   :1   Length:5           Min.   : 20   Min.   :30   Length:5          
##  1st Qu.:2   Class :character   1st Qu.: 20   1st Qu.:35   Class :character  
##  Median :3   Mode  :character   Median : 30   Median :50   Mode  :character  
##  Mean   :3                      Mean   : 50   Mean   :47                     
##  3rd Qu.:4                      3rd Qu.: 80   3rd Qu.:60                     
##  Max.   :5                      Max.   :100   Max.   :60
# Compact structure: number of rows and columns, then each column's type
# and its first few values.
str(data)
## 'data.frame':    5 obs. of  5 variables:
##  $ ID   : num  1 2 3 4 5
##  $ name : chr  "A" "B" "S" "D" ...
##  $ score: num  100 80 20 20 30
##  $ age  : num  30 35 60 50 60
##  $ dept : chr  "CS" "Bio" "Genetices" "Phy" ...

# IRIS Dataset

# read the CSV; the relative path is resolved against the working directory
# NOTE(review): R also ships a built-in dataset called `iris`; this
# assignment masks it in the workspace - confirm that is intended
iris <- read.csv("iris.csv")
iris
# 150 rows: four numeric measurements plus the variety label
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ sepal.length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ sepal.width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ petal.length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ petal.width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ variety     : chr  "Setosa" "Setosa" "Setosa" "Setosa" ...
summary(iris)
##   sepal.length    sepal.width     petal.length    petal.width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##    variety         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

### Scatter plot

library(ggplot2)
# sepal length against sepal width, one point per row, colored by variety
ggplot(iris, aes(x = sepal.length, y = sepal.width, color = variety)) +
  geom_point()

### Boxplot

# Distribution of sepal length for each variety.
# The grouping variable goes on x: a boxplot compares categories, and the
# continuous sepal.width previously mapped to x does not define any.
ggplot(
  iris,
  aes(x = variety, y = sepal.length, color = variety, fill = variety)
) +
  geom_boxplot()

### Violin plot

# Distribution of sepal length for each variety, drawn as violins.
# The grouping variable goes on x so each variety gets its own position.
# With the continuous sepal.width on x the violins overlapped and ggplot2
# warned "`position_dodge()` requires non-overlapping x intervals"; a
# discrete x should no longer raise that warning.
ggplot(
  iris,
  aes(x = variety, y = sepal.length, color = variety, fill = variety)
) +
  geom_violin()