#vector

# Create: build two numeric vectors with c()
score <- c(100, 90, 60, 70, 85, 90)
age <- c(30, 35, 60, 50, 32, 25)

# Select: pick single elements by their 1-based position
score[3]

## [1] 60

age[5]

## [1] 32

# An index past the end of the vector yields NA instead of an error
age[10]

## [1] NA

# Number of elements held by each vector
length(score)

## [1] 6

length(age)

## [1] 6

# Append element

# Add a value at the end (append() defaults to after = length(x))
age <- append(age, 39)
age

## [1] 30 35 60 50 32 25 39

score <- append(score, 78)
score

## [1] 100  90  60  70  85  90  78

# Add a value at the front
age <- append(age, 40, after = 0)
age

## [1] 40 30 35 60 50 32 25 39

# The two halves that an insertion after position 4 will be placed between
age[1:4]

## [1] 40 30 35 60

age[5:8]

## [1] 50 32 25 39

# Insert 45 between them; same result as c(age[1:4], 45, age[5:8])
age <- append(age, 45, after = 4)
age

## [1] 40 30 35 60 45 50 32 25 39

# Select a vector segment

# A range of positions returns the corresponding slice
age[3:5]

## [1] 35 60 45

# Positions beyond the end of the vector come back as NA
score[4:10]

## [1] 70 85 90 78 NA NA NA

### Remove element

# A negative index drops that position (here the 8th value, 25)
age <- age[-8]
age

## [1] 40 30 35 60 45 50 32 39

### Update

score

## [1] 100  90  60  70  85  90  78

# Overwrite the 4th score in place
score[4] <- 80
score

## [1] 100  90  60  80  85  90  78

# Conditional selection

# A comparison inside [ ] keeps only the elements for which it is TRUE
age_less_50 <- age[age < 50]
age_less_50

## [1] 40 30 35 45 32 39

age_less_equal_50 <- age[age <= 50]
age_less_equal_50

## [1] 40 30 35 45 50 32 39

age_more_equal_50 <- age[age >= 50]
age_more_equal_50

## [1] 60 50

# 40 <= age <= 50, in two steps: apply the lower bound first ...
age_greater_40 <- age[age >= 40]
age_greater_40

## [1] 40 60 45 50

# ... then the upper bound on the intermediate result
age_40_50 <- age_greater_40[age_greater_40 <= 50]
age_40_50

## [1] 40 45 50

# Same filter in one step, combining both tests with element-wise &
age_40_50_v2 <- age[age >= 40 & age <= 50]
age_40_50_v2

## [1] 40 45 50

### Condition

# On its own, the comparison produces a logical vector
result <- score >= 80
result

## [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE

# Simple statistics

# Range of the scores
min(score)

## [1] 60

max(score)

## [1] 100

# Central tendency and total
mean(score)

## [1] 83.28571

sum(score)

## [1] 583

median(score)

## [1] 85

# Spread
sd(score)

## [1] 12.60574

var(score)

## [1] 158.9048

# Minimum, quartiles and maximum in one call
quantile(score)

##   0%  25%  50%  75% 100%
##   60   79   85   90  100

### Correlation

# Extend both vectors so they end up with the same length (11 values each)
score <- c(score, 100, 80, 20, 20)
score

##  [1] 100  90  60  80  85  90  78 100  80  20  20

age <- c(age, 30, 70, 75)
age

##  [1] 40 30 35 60 45 50 32 39 30 70 75

score

##  [1] 100  90  60  80  85  90  78 100  80  20  20

age

##  [1] 40 30 35 60 45 50 32 39 30 70 75

# The coefficient is the same whichever vector is passed first
cor(score, age)

## [1] -0.7311604

cor(age, score)

## [1] -0.7311604

# Data Frame

# Five records, one per row; each column is a vector of length 5
data <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  name = c("A", "B", "S", "D", "P"),
  score = c(100, 80, 20, 20, 30),
  age = c(30, 35, 60, 50, 60)
)

data

# Select column

# `$` pulls a single column out as a plain vector
data$score

## [1] 100  80  20  20  30

data$age

## [1] 30 35 60 50 60

# Selection: data[row, column]

# Row 2, every column
data[2, ]

# Every row, column 2; a single column comes back as a vector
data[, 2]

## [1] "A" "B" "S" "D" "P"

# One cell: row 2, column 2
data[2, 2]

## [1] "B"

# Several columns at once
data[, 3:4]

data[, c(1, 3, 4)]

# Chosen rows and chosen columns together
data[c(2, 4), c(1, 3, 4)]

# Rows picked by a logical condition on a column
data[data$age <= 50, ]

# Select students with a score greater than or equal to 40

# Keep rows with score >= 40 and show the first three columns
data[data$score >= 40, 1:3]

# Select students with score >= 40 and age >= 30; show only name and score

# Both conditions must hold for a row to be kept; columns 2 and 3 are shown
data[data$score >= 40 & data$age >= 30, 2:3]

### Ordering dataset

# Rows sorted by ascending age
data[order(data$age), ]

# Rows sorted by descending score
data[order(data$score, decreasing = TRUE), ]

# Add new column

# One department label per existing row
dept <- c("CS", "Bio", "Genetices", "Phy", "Sports")
# cbind() adds the vector as a new column; use rbind() to add a new row
data <- cbind(data, dept)
data

# Summary

# Per-column overview: quartiles and mean for the numeric columns,
# length/class/mode for the character columns
summary(data)

##        ID        name               score          age         dept          
##  Min.   :1   Length:5           Min.   : 20   Min.   :30   Length:5          
##  1st Qu.:2   Class :character   1st Qu.: 20   1st Qu.:35   Class :character  
##  Median :3   Mode  :character   Median : 30   Median :50   Mode  :character  
##  Mean   :3                      Mean   : 50   Mean   :47                     
##  3rd Qu.:4                      3rd Qu.: 80   3rd Qu.:60                     
##  Max.   :5                      Max.   :100   Max.   :60

# Compact structure listing: dimensions, column types and leading values
str(data)

## 'data.frame':    5 obs. of  5 variables:
##  $ ID   : num  1 2 3 4 5
##  $ name : chr  "A" "B" "S" "D" ...
##  $ score: num  100 80 20 20 30
##  $ age  : num  30 35 60 50 60
##  $ dept : chr  "CS" "Bio" "Genetices" "Phy" ...

# IRIS Dataset

# Read the measurements from iris.csv (path is relative to the working
# directory). NOTE: assigning to `iris` masks R's built-in datasets::iris;
# the CSV uses lower-case column names and a `variety` column.
iris <- read.csv("iris.csv")
iris

# Column types and leading values
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ sepal.length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ sepal.width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ petal.length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ petal.width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ variety     : chr  "Setosa" "Setosa" "Setosa" "Setosa" ...

# Per-column summary statistics
summary(iris)

##   sepal.length    sepal.width     petal.length    petal.width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##    variety         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Scatter plot

library(ggplot2)
# Sepal length against sepal width, one colour per variety
ggplot(
  iris,
  aes(x = sepal.length, y = sepal.width, color = variety)
) +
  geom_point()

### Boxplot

# Distribution of sepal length for each variety. The grouping variable is
# mapped to x so that every variety gets its own box on a discrete axis;
# a continuous variable on x would make the per-variety boxes overlap.
ggplot(
  iris,
  aes(x = variety, y = sepal.length, color = variety, fill = variety)
) +
  geom_boxplot()

### Violin plot

# Density of sepal length for each variety. The grouping variable is mapped
# to x so the violins sit side by side on a discrete axis; a continuous x
# makes the groups overlap and triggers the position_dodge()
# "requires non-overlapping x intervals" warning.
ggplot(
  iris,
  aes(x = variety, y = sepal.length, color = variety, fill = variety)
) +
  geom_violin()

# R Notebook - section outline
#
# - Append element
# - Select a vector segment
# - Simple statistics
# - Selection: data[row, column]
# - Select students with a score greater than or equal to 40
# - Select students with score >= 40 and age >= 30; show only name and score
# - Add new column
# - Summary
# - Scatter plot