Importing a file from the local system into R

# Prompt for a CSV file with file.choose() and read it into a data frame.
# header = TRUE: the first row of the file holds the column names.
# stringsAsFactors = TRUE: read text columns as factors. This stopped being the
# default in R 4.0.0, and the per-level counts that summary(data) prints for
# the text columns depend on it.
data <- read.csv(file.choose(), header = TRUE, stringsAsFactors = TRUE)
data
##    Names Age   ID Gender Height     Region    Profit Talkitive
## 1 sujith  23 4567   male   5.11        A.P   41.9136       yes
## 2  kumar  24 3456   male   5.30     Andhra  219.5820        no
## 3  atanu  25 2153   male   5.50    Kolkata    6.8714        no
## 4  karum  27 2134   male   4.50 Pondichary -383.0310       yes
## 5   ravi  26 3214   male   5.40  Hyderabad    2.5164       yes
## 6  pavan  25 2614   male   3.40 Vijayavada   14.1694        no
## 7 shyama  22 2874 female   5.00     Kerala    1.9656       yes
## 8  pream  24 1234   male   2.10 Tamil nadu   90.7152       yes
# (or)
 
.csv file:-

# read.table() is the general reader; sep = "," makes it parse a .csv file.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data1 <- read.table(file.choose(), header = TRUE, sep = ",")
data1

Text file:-

# Read a tab-delimited text file chosen interactively.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data2 <- read.delim(file.choose(), header = TRUE, sep = "\t")

# (or)

# sep = "\t" is already the default of read.delim(), so this call is equivalent
data2 <- read.delim(file.choose(), header = TRUE)

data2

Descriptive Statistics

Descriptive statistics is the term given to the analysis of data that helps describe, show or summarize data in a meaningful way

Mean

To access the columns of the data frame by name, we need to attach it.

# attach() places the columns of `data` on the search path, so they can be
# referred to by bare name (Age, Height, ...) in the calls that follow
attach(data)
# Arithmetic mean of Age
mean(Age)
## [1] 24.5
# trim is the fraction of observations dropped from each end before averaging
mean(Age, trim = 0.1)
## [1] 24.5

Median

# Middle value of Age
median(Age)
## [1] 24.5

Mode

# mode() reports the storage mode (data type) of an object,
# not the statistical mode
mode(Age)
## [1] "numeric"
# Statistical mode: the value(s) of Age that occur most often
age_counts <- table(Age)
names(which(age_counts == max(age_counts)))
## [1] "24" "25"

table

# Frequency of each distinct Age; the most frequent value(s) are the
# statistical mode
table(Age)
## Age
## 22 23 24 25 26 27 
##  1  1  2  2  1  1

variance

# Variance of Age
var(Age)
## [1] 2.571429

standard deviation

# Standard deviation of Age
sd(Age)
## [1] 1.603567
# (or)
# equivalently, the square root of the variance
sqrt(var(Age))
## [1] 1.603567

Maximum

# Largest value of Age
max(Age)
## [1] 27

Minimum

# Smallest value of Age
min(Age)
## [1] 22

Range

# Returns the minimum and the maximum of Age as a length-two vector
range(Age)
## [1] 22 27

Quantile

# 90th percentile of Age
quantile(Age, probs = 0.9)
##  90% 
## 26.3
# Several quantiles at once: probs takes a vector of probabilities
quantile(Age, probs = c(0, 0.2, 0.5, 0.9, 1))
##   0%  20%  50%  90% 100% 
## 22.0 23.4 24.5 26.3 27.0

Sum

# Total of all Age values
sum(Age)
## [1] 196

correlation

# Correlation coefficient between Age and Height
cor(Age, Height)
## [1] -0.004837542

covariance

# Covariance of Age and Height. When var() is given two vectors it returns
# their covariance, so both calls print the same value
cov(Age, Height)
## [1] -0.009285714
var(Age, Height)
## [1] -0.009285714

summary

# For a numeric vector summary() gives the minimum, 1st quartile, median,
# mean, 3rd quartile and maximum
summary(Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.00   23.75   24.50   24.50   25.25   27.00
# Applied to the whole data frame, summary() reports every column in turn
summary(data)
##      Names        Age              ID          Gender      Height     
##  atanu  :1   Min.   :22.00   Min.   :1234   female:1   Min.   :2.100  
##  karum  :1   1st Qu.:23.75   1st Qu.:2148   male  :7   1st Qu.:4.225  
##  kumar  :1   Median :24.50   Median :2744              Median :5.055  
##  pavan  :1   Mean   :24.50   Mean   :2781              Mean   :4.539  
##  pream  :1   3rd Qu.:25.25   3rd Qu.:3274              3rd Qu.:5.325  
##  ravi   :1   Max.   :27.00   Max.   :4567              Max.   :5.500  
##  (Other):2                                                            
##         Region      Profit          Talkitive
##  A.P       :1   Min.   :-383.0310   no :3    
##  Andhra    :1   1st Qu.:   2.3787   yes:5    
##  Hyderabad :1   Median :  10.5204            
##  Kerala    :1   Mean   :  -0.6622            
##  Kolkata   :1   3rd Qu.:  54.1140            
##  Pondichary:1   Max.   : 219.5820            
##  (Other)   :2

Normal distribution

# pnorm() returns a cumulative probability of the normal distribution.
# q is the value on the x scale (not a percentage). With lower.tail = TRUE the
# result is P(X <= 70) for X ~ N(mean = 24, sd = 1.6).
pnorm(q = 70, mean = 24, sd = 1.6, lower.tail = TRUE)
## [1] 1
# lower.tail = FALSE returns the upper tail instead: P(X > 70)
pnorm(q = 70, mean = 24, sd = 1.6, lower.tail = FALSE)
## [1] 4.528788e-182


# Standard normal (z-score): P(Z > 1)
pnorm(q = 1, mean = 0, sd = 1, lower.tail = FALSE)
## [1] 0.1586553
# qnorm() is the inverse of pnorm(): it returns the quantile (percentile)
# that belongs to a given probability.
# Q1 (1st quartile) of N(mean = 24, sd = 1.6)
qnorm(p = 0.25, mean = 24, sd = 1.6, lower.tail = TRUE)
## [1] 22.92082

charts

Barchart

# Bar chart with one bar per observation of Age.
# las = 1 draws the axis tick labels horizontally
barplot(Age, main = "Age", xlab = "age", ylab = "values", las = 1)

# Same chart with the default orientation of the axis labels
barplot(Age, main = "Age", xlab = "age", ylab = "values")

Piechart

# Pie chart of the heights, one slice per person.
# labels = Names writes each person's name next to their slice. names(Height)
# would be NULL here because a data-frame column carries no element names,
# and pie() then falls back to numbering the slices 1, 2, 3, ...
# radius sets the size of the pie, clockwise = TRUE draws the slices clockwise,
# and edges is the number of segments used to approximate the circle
# (higher = smoother outline).
pie(Height, radius = 1, clockwise = TRUE, labels = Names, edges = 100)

Boxplot

# Box plot of Height with the default settings
boxplot(Height)

# Same plot with a title and a y-axis label; ylim fixes the y-axis to 0-6 and
# las = 1 draws the axis tick labels horizontally instead of vertically
boxplot(Height, main = "Height", ylab = "numbers", ylim = c(0, 6), las = 1)

# Comparing two variables: Age split by the levels of Gender
boxplot(Age ~ Gender)

# The same comparison built by hand from subsets of one variable
boxplot(Age[Gender == "male"], Age[Gender == "female"])

Histogram

# Histogram of Age with the default bins and counts on the y-axis
hist(Age)

# freq = FALSE plots densities instead of counts
hist(Age, freq = FALSE)

# breaks = 7 asks for about 7 cells. A single number is only a suggestion:
# hist() rounds to "pretty" break points, so the number of bars can differ
hist(Age, breaks = 7)

Scatterplot

# Scatter plot of Height against Age.
# xlim restricts the x-axis to 20-30, pch selects the point symbol and
# col the point colour; las = 1 keeps the axis tick labels horizontal
plot(Age, Height, main = "Scatterplot", las = 1, xlim = c(20, 30), pch = 2,
     col = 4)

Stem and leaf

# Keep the male heights in their own variable. Assigning to `Height` would
# create a global object that masks the attached Height column for all code
# that runs afterwards.
male_height <- Height[Gender == "male"]
# scale controls the length of the display: scale = 1 is the default and
# scale = 2 makes the plot roughly twice as long
stem(male_height, scale = 1)
## 
##   The decimal point is at the |
## 
##   2 | 1
##   3 | 4
##   4 | 5
##   5 | 1345

Stacked bar chart (to find the relationship between categorical variables)

# Cross-tabulate Names against Gender. The result gets its own name so that it
# does not shadow the base function table().
name_gender <- table(Names, Gender)
barplot(name_gender)          # default: stacked bars, one bar per Gender level

barplot(name_gender, beside = TRUE)  # beside = TRUE: bars appear side by side

Mosaic plot

mosaicplot(name_gender)

END