ASSIGNMENT 1

problem1

SPECTRAL ANALYSIS

#setwd("E:/kanta")
Z<-read.csv("spectra.csv")
x<-Z$wavelength
y<-Z$absorb
mainlab=paste("wavelength vs absorbtion")
xlabel=paste("wavelength")
ylabel=paste("absorption")
plot(x,y,main = mainlab,col="red",cex=0.5,xlab = xlabel,ylab = ylabel)

jk<-which(x>450 & x<750)
length(jk)
## [1] 1557
wv<-x[jk]
aa<-y[jk]
plot(wv,aa,main = mainlab,col="red",cex=0.5,xlab = xlabel,ylab = ylabel)
mia<-max(aa)
mia
## [1] 0.386
miv<-which(aa==mia)
miv
## [1] 850
wvmax<-wv[850]
wvmax
## [1] 618.08
text(545,0.386,"wvmax= 618.08--->",cex = 0.7, col="blue")
lm(aa~wv)
## 
## Call:
## lm(formula = aa ~ wv)
## 
## Coefficients:
## (Intercept)           wv  
##   8.721e-02    2.193e-06
abline(h=0.386,v=618.08,col="black")

summary(wvmax)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   618.1   618.1   618.1   618.1   618.1   618.1

problem 2

DISTRICT WISE ANALYSIS OF THALASEMIA

#setwd("E:/kanta")
z<-read.csv("Murshidabad.csv")
z.m<-z
summary(z.m)
##       AGE    SEX       HPLC.NO             HB             RBC       
##  13 YRS :4   F: 3   Min.   : 13.00   Min.   :12.10   Min.   :3.200  
##  14 YRS :4   M:17   1st Qu.: 48.25   1st Qu.:12.20   1st Qu.:3.900  
##  15 YRS :2          Median :114.00   Median :12.40   Median :4.800  
##  17 YRS :2          Mean   :103.95   Mean   :12.37   Mean   :4.595  
##  8 YRS  :2          3rd Qu.:155.25   3rd Qu.:12.50   3rd Qu.:5.025  
##  11 YRS :1          Max.   :174.00   Max.   :12.80   Max.   :6.000  
##  (Other):5                                                          
##       PCV             MCV             MCH             MCHC      
##  Min.   :27.90   Min.   :62.00   Min.   :16.10   Min.   :25.40  
##  1st Qu.:33.80   1st Qu.:68.47   1st Qu.:19.70   1st Qu.:28.40  
##  Median :36.50   Median :71.40   Median :20.60   Median :28.75  
##  Mean   :35.32   Mean   :70.77   Mean   :20.57   Mean   :28.68  
##  3rd Qu.:37.17   3rd Qu.:73.38   3rd Qu.:21.30   3rd Qu.:29.60  
##  Max.   :39.10   Max.   :77.30   Max.   :25.00   Max.   :30.90  
##                                                                 
##       RDW             HbA0            HbA2            HbF      
##  Min.   :11.60   Min.   :84.30   Min.   :1.600   Min.   :0.10  
##  1st Qu.:12.62   1st Qu.:86.60   1st Qu.:2.375   1st Qu.:0.20  
##  Median :13.65   Median :87.45   Median :2.500   Median :0.25  
##  Mean   :14.23   Mean   :87.29   Mean   :2.400   Mean   :0.25  
##  3rd Qu.:15.47   3rd Qu.:88.33   3rd Qu.:2.600   3rd Qu.:0.30  
##  Max.   :21.50   Max.   :89.20   Max.   :2.600   Max.   :0.50  
##                                                                
##                MOLECULAR
##  alpha mutation (3) :7  
##  alpha mutation (34):2  
##  alpha mutation (4) :1  
##  alpha mutation (6) :2  
##  alpha mutation (8) :2  
##  normal             :6  
## 
#setwd("E:/kanta")
zx<-read.csv("Nadiatribes.csv")
zx.n<-zx
summary(zx.n)
##     HPLC..No            HB           X                RBC       
##  Min.   :  8.00   Min.   :12.10   Mode:logical   Min.   :4.510  
##  1st Qu.: 52.50   1st Qu.:12.25   NA's:47        1st Qu.:4.965  
##  Median : 87.00   Median :12.70                  Median :5.180  
##  Mean   : 79.06   Mean   :12.95                  Mean   :5.256  
##  3rd Qu.:106.50   3rd Qu.:13.45                  3rd Qu.:5.600  
##  Max.   :121.00   Max.   :14.90                  Max.   :6.030  
##       HCT             MCV             MCH             MCHC      
##  Min.   :31.60   Min.   :61.50   Min.   :17.60   Min.   :28.60  
##  1st Qu.:34.70   1st Qu.:68.80   1st Qu.:20.60   1st Qu.:29.90  
##  Median :36.80   Median :70.80   Median :21.40   Median :30.30  
##  Mean   :37.01   Mean   :70.45   Mean   :21.41   Mean   :30.38  
##  3rd Qu.:39.25   3rd Qu.:72.30   3rd Qu.:22.10   3rd Qu.:30.80  
##  Max.   :45.00   Max.   :74.90   Max.   :23.70   Max.   :31.80  
##       RDW             HBA0            HBA2            HBF        
##  Min.   :13.70   Min.   :86.00   Min.   :1.600   Min.   :0.1000  
##  1st Qu.:15.05   1st Qu.:86.50   1st Qu.:2.600   1st Qu.:0.2000  
##  Median :16.00   Median :87.00   Median :2.700   Median :0.3000  
##  Mean   :16.07   Mean   :87.01   Mean   :2.723   Mean   :0.3702  
##  3rd Qu.:16.80   3rd Qu.:87.35   3rd Qu.:2.900   3rd Qu.:0.5000  
##  Max.   :18.80   Max.   :88.90   Max.   :3.200   Max.   :0.9000  
##  MOLECULAR
##  amut:15  
##  n   :32  
##           
##           
##           
## 
#setwd("E:/kanta")
xz<-read.csv("birbhum.csv")
xz.b<-xz
summary(xz.b)
##       AGE     SEX                  SCHOOL   HPLC.TEST.NO.       HB       
##  12Y    : 9   F:12                    : 1   622A   : 1    Min.   :12.10  
##  14Y    : 5   M:38   N.A.T.HIGH SCHOOL:49   628A   : 1    1st Qu.:12.30  
##  14Y2M  : 3                                 631A   : 1    Median :12.75  
##  15Y7M  : 3                                 632A   : 1    Mean   :12.83  
##  11Y10M : 2                                 636A   : 1    3rd Qu.:13.20  
##  12Y6M  : 2                                 640A   : 1    Max.   :14.80  
##  (Other):26                                 (Other):44                   
##  DIAGNOSIS         HPLC.DATE       RBC             PCV       
##  Mode:logical   5.7.2016:10   Min.   :4.920   Min.   :37.50  
##  NA's:50        6.7.2016:31   1st Qu.:5.275   1st Qu.:40.42  
##                 7.7.2016: 6   Median :5.570   Median :41.45  
##                 8.7.2016: 3   Mean   :5.604   Mean   :41.71  
##                               3rd Qu.:5.950   3rd Qu.:43.15  
##                               Max.   :6.420   Max.   :47.50  
##                                                              
##       MCV             MCH             MCHC            RDW       
##  Min.   :65.70   Min.   :19.30   Min.   :29.40   Min.   :12.60  
##  1st Qu.:72.33   1st Qu.:21.70   1st Qu.:30.10   1st Qu.:14.40  
##  Median :75.30   Median :23.20   Median :30.80   Median :15.20  
##  Mean   :74.57   Mean   :22.99   Mean   :30.82   Mean   :15.29  
##  3rd Qu.:77.17   3rd Qu.:24.27   3rd Qu.:31.57   3rd Qu.:16.00  
##  Max.   :79.30   Max.   :26.00   Max.   :32.90   Max.   :18.90  
##                                                                 
##       HbA0            HbA2            HbF                  MOLECULAR 
##  Min.   :84.10   Min.   :2.100   Min.   :0.200   Alpha Mutation :29  
##  1st Qu.:85.70   1st Qu.:2.600   1st Qu.:0.200   Normal         :21  
##  Median :86.50   Median :2.800   Median :0.300                       
##  Mean   :86.49   Mean   :2.764   Mean   :0.336                       
##  3rd Qu.:87.28   3rd Qu.:2.900   3rd Qu.:0.400                       
##  Max.   :88.60   Max.   :3.700   Max.   :0.700                       
## 
#setwd("E:/kanta")
mn<-read.csv("jalpaiguri.csv")
mn.j<-mn
summary(mn.j)
##      SL.NO            NAME        SURNAME       AGE        SEX   
##  Min.   : 2.00   Sahil  : 2   Oraon   :6   Min.   :10.00   M:18  
##  1st Qu.: 6.25   Ashik  : 1   Oraon   :6   1st Qu.:11.00         
##  Median :10.50   Atish  : 1   Bhagat  :1   Median :12.00         
##  Mean   :10.50   Bipul  : 1   Lama    :1   Mean   :12.22         
##  3rd Qu.:14.75   Bishal : 1   Munda   :1   3rd Qu.:13.75         
##  Max.   :19.00   Jeet   : 1   Shaibya :1   Max.   :15.00         
##                  (Other):11   (Other) :2                         
##         School      HPLC.NO            HB          HPLC.TEST.NO.
##  Nathuahat :18   Min.   : 3.00   Min.   :12.10   1/Nathua : 1   
##                  1st Qu.:15.25   1st Qu.:12.22   17/Nathua: 1   
##                  Median :21.00   Median :12.45   23/Nathua: 1   
##                  Mean   :37.61   Mean   :12.49   31/Nathua: 1   
##                  3rd Qu.:69.25   3rd Qu.:12.68   33/Nathua: 1   
##                  Max.   :83.00   Max.   :13.10   37/Nathua: 1   
##                                                  (Other)  :12   
##     HPLC.DATE      RBC             PCV             MCV       
##  16.03.16:5   Min.   :4.550   Min.   :34.10   Min.   :66.70  
##  17.03.16:6   1st Qu.:4.787   1st Qu.:35.62   1st Qu.:70.05  
##  18.03.17:1   Median :5.055   Median :36.55   Median :71.50  
##  18.03.19:1   Mean   :5.121   Mean   :36.92   Mean   :72.32  
##  18.03.20:1   3rd Qu.:5.492   3rd Qu.:37.50   3rd Qu.:74.67  
##  18.03.26:3   Max.   :5.940   Max.   :41.20   Max.   :78.50  
##  31.03.16:1                                                  
##       MCH             MCHC            RDW             HbA0      
##  Min.   :21.20   Min.   :31.10   Min.   :14.00   Min.   :87.30  
##  1st Qu.:22.60   1st Qu.:31.90   1st Qu.:15.62   1st Qu.:87.40  
##  Median :23.10   Median :32.30   Median :16.45   Median :87.55  
##  Mean   :23.41   Mean   :32.35   Mean   :16.59   Mean   :87.70  
##  3rd Qu.:24.20   3rd Qu.:32.65   3rd Qu.:18.18   3rd Qu.:87.88  
##  Max.   :26.60   Max.   :33.90   Max.   :18.90   Max.   :88.50  
##                                                                 
##       HbA2            HbF         MOLICULAR.MUTATION
##  Min.   :2.700   Min.   :0.0000   amut:14           
##  1st Qu.:2.900   1st Qu.:0.0250   n   : 4           
##  Median :2.950   Median :0.2000                     
##  Mean   :3.000   Mean   :0.2278                     
##  3rd Qu.:3.175   3rd Qu.:0.3750                     
##  Max.   :3.300   Max.   :0.7000                     
## 
#setwd("E:/kanta")
abc<-read.csv("bankura.csv")
abc.b<-abc
summary(abc.b)
##        NAME         SURNAME         AGE        SEX          RBC       
##  SRIJAN  :  10   MURMU  :167   Min.   : 2.00    :  1   Min.   :3.450  
##  BISWAJIT:   9   MANDI  :120   1st Qu.:10.00   F:458   1st Qu.:4.800  
##  SONALI  :   9   SOREN  : 97   Median :13.00   M:692   Median :5.180  
##  MARSHAL :   8   TUDU   : 85   Mean   :12.79           Mean   :5.176  
##  RUPALI  :   8   HANSDA : 75   3rd Qu.:15.00           3rd Qu.:5.550  
##  Sagun   :   7   HEMBRAM: 63   Max.   :31.00           Max.   :8.450  
##  (Other) :1100   (Other):544   NA's   :1                              
##        HB             MCV               MCH             MCHC      
##  Min.   : 1.30   Min.   :  21.20   Min.   :12.60   Min.   : 7.90  
##  1st Qu.:11.20   1st Qu.:  71.00   1st Qu.:21.40   1st Qu.:29.70  
##  Median :11.90   Median :  74.40   Median :22.60   Median :30.60  
##  Mean   :11.99   Mean   :  83.04   Mean   :23.36   Mean   :30.59  
##  3rd Qu.:12.65   3rd Qu.:  80.25   3rd Qu.:25.04   3rd Qu.:31.50  
##  Max.   :35.10   Max.   :7608.00   Max.   :72.20   Max.   :45.50  
##                                                                   
##       RDW             HBA0            HBA2            HBF         
##  Min.   : 1.20   Min.   :54.30   Min.   :0.000   Min.   : 0.0000  
##  1st Qu.:14.80   1st Qu.:86.50   1st Qu.:2.600   1st Qu.: 0.2000  
##  Median :15.90   Median :87.60   Median :2.800   Median : 0.3000  
##  Mean   :16.18   Mean   :86.94   Mean   :2.942   Mean   : 0.3266  
##  3rd Qu.:17.10   3rd Qu.:88.20   3rd Qu.:3.000   3rd Qu.: 0.4000  
##  Max.   :70.10   Max.   :90.50   Max.   :6.900   Max.   :14.1000  
##                                                                   
##     X             X.1            X.2            X.3         
##  Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  NA's:1151      NA's:1151      NA's:1151      NA's:1151     
##                                                             
##                                                             
##                                                             
##                                                             
##                                                             
##    X.4            X.5               X.6      
##  Mode:logical   Mode:logical   Min.   :25.9  
##  NA's:1151      NA's:1151      1st Qu.:25.9  
##                                Median :25.9  
##                                Mean   :25.9  
##                                3rd Qu.:25.9  
##                                Max.   :25.9  
##                                NA's   :1150

DISTRICT WISE ANALYSIS OF THALASEMIA WITH BOX PLOT AND BAR PLOT ANALYSIS

#setwd("E:/kanta")
l<-read.csv(file = "Nadiatribes.csv")
z<-l$MOLECULAR
total<-length(z)
pk<- which(z=="n")
m<-length(pk)
alpha<-nrow(l)-m
prb<-alpha/total
prb
## [1] 0.3191489
hb<-l$HB
mean(hb)
## [1] 12.95106
hbf<-l$HBF
mean(hbf)
## [1] 0.3702128
hba0<-l$HBA0
mean(hba0)
## [1] 87.01277
hba2<-l$HBA2
mean(hba2)
## [1] 2.723404
alphamut<-which(z=="amut")
mean(hb[alphamut])
## [1] 13.05333
mean(hbf[alphamut])
## [1] 0.3533333
mean(hba0[alphamut])
## [1] 87.02
mean(hba2[alphamut])
## [1] 2.8
q<-read.csv(file = "birbhum.csv")
p<-q$MOLICULAR.MUTATION
totalbir<-length(p)
pkbir<- which(p=="amut")
mbir<-length(pkbir)

prbbir<-mbir/totalbir
prbbir
## [1] NaN
hbbir<-q$HB
mean(hbbir)
## [1] 12.832
hbfbir<-q$HbF
mean(hbfbir)
## [1] 0.336
hba0bir<-q$HbA0
mean(hba0bir)
## [1] 86.488
hba2bir<-q$HbA2
mean(hba2bir)
## [1] 2.764
alphamutbir<-which(p=="amut")
mean(hbbir[alphamutbir])
## [1] NaN
mean(hbfbir[alphamutbir])
## [1] NaN
mean(hba0bir[alphamutbir])
## [1] NaN
mean(hba2bir[alphamutbir])
## [1] NaN
f<-read.csv(file = "Murshidabad.csv")
g<-f$MOLECULAR
totalmur<-length(g)
pkmur<- which(g=="amut")
mmur<-length(pkmur)

prbmur<-mmur/totalmur
prbmur
## [1] 0
hbmur<-f$HB
mean(hbmur)
## [1] 12.37
hbfmur<-f$HbF
mean(hbfmur)
## [1] 0.25
hba0mur<-f$HbA0
mean(hba0mur)
## [1] 87.29
hba2mur<-f$HbA2

alphamutmur<-which(g=="amut")
mean(hbmur[alphamutmur])
## [1] NaN
mean(hbfmur[alphamutmur])
## [1] NaN
mean(hba0mur[alphamutmur])
## [1] NaN
mean(hba2mur[alphamutmur])
## [1] NaN
t<-read.csv(file = "JALPAIGURI.csv")
y<-t$MOLICULAR.MUTATION
totaljalp<-length(y)
pkjalp<- which(y=="amut")
mjalp<-length(pkjalp)

prbjalp<-mjalp/totaljalp
prbjalp
## [1] 0.7777778
hbjalp<-t$HB
mean(hbjalp)
## [1] 12.48889
hbfjalp<-t$HbF
mean(hbfjalp)
## [1] 0.2277778
hba0jalp<-t$HbA0
mean(hba0jalp)
## [1] 87.7
hba2jalp<-t$HbA2

alphamutjalp<-which(y=="amut")
mean(hbjalp[alphamutjalp])
## [1] 12.55
mean(hbfjalp[alphamutjalp])
## [1] 0.2142857
mean(hba0jalp[alphamutjalp])
## [1] 87.68571
mean(hba2jalp[alphamutjalp])
## [1] 3.042857
HB<-cbind(mean(hb),mean(hbbir),mean(hbmur),mean(hbjalp))
colnames(HB)<-c("Nhb","Bhb","Mhb","Jhb")
d<-rbind(HB)
print(d)
##           Nhb    Bhb   Mhb      Jhb
## [1,] 12.95106 12.832 12.37 12.48889
par(mfrow=c(2,2))
barplot(HB,col = "yellow")

HBF<-cbind(mean(hbf),mean(hbfbir),mean(hbfmur),mean(hbfjalp))
colnames(HBF)<-c("Nhbf","Bhbf","Mhbf","Jhbf")
d<-rbind(HBF)
print(d)
##           Nhbf  Bhbf Mhbf      Jhbf
## [1,] 0.3702128 0.336 0.25 0.2277778
par(mfrow=c(2,2))

barplot(HBF,col = "red")

HBa0<-cbind(mean(hba0),mean(hba0bir),mean(hba0mur),mean(hba0jalp))
colnames(HB)<-c("Nhba0","Bhba0","Mhba0","Jhba0")
d<-rbind(HB)
print(d)
##         Nhba0  Bhba0 Mhba0    Jhba0
## [1,] 12.95106 12.832 12.37 12.48889
par(mfrow=c(2,2))

barplot(HB,col = "blue")

HBa2<-cbind(mean(hba2),mean(hba2bir),mean(hba2mur),mean(hba2jalp))
colnames(HBa2)<-c("Nhba2","Bhba2","Mhba2","Jhba2")
d<-rbind(HBa2)
print(d)
##         Nhba2 Bhba2 Mhba2 Jhba2
## [1,] 2.723404 2.764   2.4     3
par(mfrow=c(2,2))

barplot(HBa2,col = "black")

bp.hb<-cbind(hb,hbmur,hbjalp,hbbir)
## Warning in cbind(hb, hbmur, hbjalp, hbbir): number of rows of result is not
## a multiple of vector length (arg 1)
colnames(bp.hb)<-c("N.HB","M.HB","J.HB","B.HB")
boxplot(bp.hb)

bp.hbf<-cbind(hbf,hbfmur,hbfjalp,hbfbir)
## Warning in cbind(hbf, hbfmur, hbfjalp, hbfbir): number of rows of result is
## not a multiple of vector length (arg 1)
colnames(bp.hbf)<-c("N.HBF","M.HBF","J.HBF","B.HBF")
boxplot(bp.hbf)

bp.hba0<-cbind(hba0,hba0mur,hba0jalp,hba0bir)
## Warning in cbind(hba0, hba0mur, hba0jalp, hba0bir): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(bp.hba0)<-c("N.HBA0","M.HBA0","J.HBA0","B.HBA0")
boxplot(bp.hba0)

bp.hba2<-cbind(hba2,hba2mur,hba2jalp,hba2bir)
## Warning in cbind(hba2, hba2mur, hba2jalp, hba2bir): number of rows of
## result is not a multiple of vector length (arg 1)
colnames(bp.hba0)<-c("N.HBA2","M.HBA2","J.HBA2","B.HBA2")
boxplot(bp.hba2)

PROBLEM-3

Master file

#setwd("E:/kanta")
n<-read.csv("Nadiatribes.csv")
b<-read.csv("birbhum.csv")
m<-read.csv("Murshidabad.csv")
j<-read.csv("JALPAIGURI.csv")
nad<-cbind(n$HB,n$MCV,n$MCH,n$HBA0,n$HBA2,n$HBF,n$MOLECULAR)
distn<-rep("Nadiatribes",nrow(n))
f.n<-cbind(nad,distn)
bir<-cbind(b$HB,b$MCV,b$MCH,b$HbA0,b$HbA2,b$HbF,b$MOLECULAR)
distb<-rep("birbhum",nrow(b))
f.b<-cbind(bir,distb)
mur<-cbind(m$HB,m$MCV,m$MCH,m$HbA0,m$HbA2,m$HbF,m$MOLECULAR)
distm<-rep("Murshidabad",nrow(m))
f.m<-cbind(mur,distm)
jal<-cbind(j$HB,j$MCV,j$MCH,j$HbA0,j$HbA2,j$HbF,j$MOLICULAR.MUTATION)
distj<-rep("JALPAIGURI",nrow(j))
f.j<-cbind(jal,distj)
master<-rbind(f.n,f.b,f.m,f.j)
master1<-data.frame(master)
summary(master1)
##        V1           V2            V3            V4           V5    
##  12.2   :21   69.3   :  4   20.6   :  6   87.5   :10   2.7    :22  
##  12.1   :14   70.8   :  4   21.1   :  6   87.3   : 8   2.9    :20  
##  12.3   :12   69.8   :  3   21.8   :  5   86.5   : 7   2.6    :17  
##  12.4   :10   71.3   :  3   22.6   :  5   86.1   : 6   3      :17  
##  12.5   :10   71.9   :  3   23.1   :  5   86.7   : 6   2.8    :16  
##  12.9   : 9   72.3   :  3   24.2   :  5   87.4   : 6   2.5    :13  
##  (Other):59   (Other):115   (Other):103   (Other):92   (Other):30  
##        V6     V7             distn   
##  0.2    :43   1:65   birbhum    :50  
##  0.3    :40   2:59   JALPAIGURI :18  
##  0.4    :20   3: 1   Murshidabad:20  
##  0.5    : 8   4: 2   Nadiatribes:47  
##  0.6    : 6   5: 2                   
##  0.7    : 6   6: 6                   
##  (Other):12
bp<-cbind(master1$V1,master1$V2,master1$V3,master1$V4,master1$V5,master1$V6)
colnames(bp)<-c("HB","MCV","MCH","HBA0","HBA2","HBF")
boxplot(bp,main="boxplot")

barplot(bp,main = "BARPLOT",col = "blue")

M<-master1
hb<-master1$V1
mch<-master1$V3
mcv<-master1$V2
hba0<-master1$V4
hba2<-master1$V5
hbf<-master1$V6
disease<-master1$V7
x<-cbind(hb,mch,mcv,hba0,hba2,hbf)
colnames(x)<-c("HB","MCH","MCV","HBA0","HBA2","HBF")
x.pca<-prcomp(x,scale.=TRUE)
summary(x.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6
## Standard deviation     1.4197 1.1461 1.0618 0.8851 0.8038 0.3376
## Proportion of Variance 0.3359 0.2189 0.1879 0.1306 0.1077 0.0190
## Cumulative Proportion  0.3359 0.5548 0.7427 0.8733 0.9810 1.0000
screeplot(x.pca)

library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid

g<-ggbiplot(x.pca,obs.scale=1,var.scale=1,groups=disease,ellipse=TRUE,circle=TRUE) 
g <- g + scale_color_discrete(name = '')  
g <- g + theme(legend.direction = 'horizontal', legend.position = 'top') 
print(g)

#PCA ANALYSIS OF EACH DISTRICTS

## nadia

#setwd("E:/kanta")
k<-read.csv(file ="Nadiatribes.csv")
hb.nad<-k$HB
hba0.nad<-k$HBA0
hba2.nad<-k$HBA2
hbf.nad<-k$HBF
mcv.nad<-k$MCV
mch.nad<-k$MCH
mol.nad<-k$MOLECULAR
c<-cbind(hb.nad,hba0.nad,hba2.nad,hbf.nad,mcv.nad,mch.nad)
colnames(c)<-c("hb.nad","hba0.nad","hba2.nad","hbf.nad","mcv.nad","mch.nad")
n.pca<-prcomp(c,scale. =TRUE)
summary(n.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6
## Standard deviation     1.4175 1.2004 1.0570 0.8700 0.7829 0.25066
## Proportion of Variance 0.3349 0.2401 0.1862 0.1261 0.1022 0.01047
## Cumulative Proportion  0.3349 0.5750 0.7612 0.8874 0.9895 1.00000
screeplot(n.pca)

biplot(n.pca)

library(ggbiplot)
g<-ggbiplot(n.pca,obs.scale = 1,var.scale = 1,groups = mol.nad,ellipse = TRUE,circle = TRUE)
g<-g+ scale_color_discrete(name='')
g<-g+ theme(legend.direction='horizontal',legend.position='top')
print(g)

## birbhum


t<-read.csv(file ="birbhum.csv")
hb.bir<-t$HB
hba0.bir<-t$HbA0
hba2.bir<-t$HbA2
hbf.bir<-t$HbF
mcv.bir<-t$MCV
mch.bir<-t$MCH
mol.bir<-t$MOLICULAR.MUTATION
c1<-cbind(hb.bir,hba0.bir,hba2.bir,hbf.bir,mcv.bir,mch.bir)
colnames(c1)<-c("hb.bir","hba0.bir","hba2.bir","hbf.bir","mcv.bir","mch.bir")
b.pca<-prcomp(c1,scale. =TRUE)
summary(b.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.5002 1.2399 1.0553 0.8125 0.61801 0.23700
## Proportion of Variance 0.3751 0.2562 0.1856 0.1100 0.06366 0.00936
## Cumulative Proportion  0.3751 0.6313 0.8169 0.9270 0.99064 1.00000
screeplot(b.pca)

biplot(b.pca)

library(ggbiplot)
g<-ggbiplot(b.pca,obs.scale = 1,var.scale = 1,groups = mol.bir,ellipse = TRUE,circle = TRUE)
g<-g+ scale_color_discrete(name='')
g<-g+ theme(legend.direction='horizontal',legend.position='top')
print(g)

## murshidabad



s<-read.csv(file ="birbhum.csv")
hb.mur<-s$HB
hba0.mur<-s$HbA0
hba2.mur<-s$HbA2
hbf.mur<-s$HbF
mcv.mur<-s$MCV
mch.mur<-s$MCH
mol.mur<-s$MOLECULAR
c2<-cbind(hb.mur,hba0.mur,hba2.mur,hbf.mur,mcv.mur,mch.mur)
colnames(c2)<-c("hb.mur","hba0.mur","hba2.mur","hbf.mur","mcv.mur","mch.mur")
m.pca<-prcomp(c2,scale. =TRUE)
summary(m.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.5002 1.2399 1.0553 0.8125 0.61801 0.23700
## Proportion of Variance 0.3751 0.2562 0.1856 0.1100 0.06366 0.00936
## Cumulative Proportion  0.3751 0.6313 0.8169 0.9270 0.99064 1.00000
screeplot(m.pca)

biplot(m.pca)

library(ggbiplot)
g<-ggbiplot(m.pca,obs.scale = 1,var.scale = 1,groups = mol.mur,ellipse = TRUE,circle = TRUE)
g<-g+ scale_color_discrete(name='')
g<-g+ theme(legend.direction='horizontal',legend.position='top')
print(g)

## jalpaiguri


p<-read.csv(file ="JALPAIGURI.csv")
hb.jal<-p$HB
hba0.jal<-p$HbA0
hba2.jal<-p$HbA2
hbf.jal<-p$HbF
mcv.jal<-p$MCV
mch.jal<-p$MCH
mol.jal<-p$MOLICULAR.MUTATION
c3<-cbind(hb.jal,hba0.jal,hba2.jal,hbf.jal,mcv.jal,mch.jal)
colnames(c3)<-c("hb.jal","hba0.jal","hba2.jal","hbf.jal","mcv.jal","mch.jal")
j.pca<-prcomp(c3,scale. =TRUE)
summary(j.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6
## Standard deviation     1.5271 1.1994 1.0591 0.8235 0.59691 0.27108
## Proportion of Variance 0.3886 0.2397 0.1869 0.1130 0.05938 0.01225
## Cumulative Proportion  0.3886 0.6284 0.8153 0.9284 0.98775 1.00000
screeplot(j.pca)

biplot(j.pca)

library(ggbiplot)
g<-ggbiplot(j.pca,obs.scale = 1,var.scale = 1,groups = mol.jal,ellipse = TRUE,circle = TRUE)
g<-g+ scale_color_discrete(name='')
g<-g+ theme(legend.direction='horizontal',legend.position='top')
print(g)

DATA QUIZ 1

problem 1

#setwd("E:/kanta")
temp = list.files(pattern = "*.csv")
x<-list()
for(i in 1:5) {x[[i]]<-read.csv(temp[i])}
bind.data<-list()
for(i in 1:5){bind.data<-rbind(bind.data,x[i])}
(bind.data)
##      [,1]   
## [1,] List,16
## [2,] List,6 
## [3,] List,20
## [4,] List,17
## [5,] List,16

PROTEIN PDB FILE

library("bio3d")
## Warning: package 'bio3d' was built under R version 3.4.4
library("scatterplot3d")
## Warning: package 'scatterplot3d' was built under R version 3.4.4

problem 1

#setwd("E:/kanta")
a<-read.csv("bond.csv")
a
d<-a$atom
f<-a$x
g<-a$y
h<-a$z
x.ab<-f[1]-f[2]
y.ab<-g[1]-g[2]
z.ab<-h[1]-h[2]
x.cb<-f[3]-f[2]
y.cb<-g[3]-g[2]
z.cb<-h[3]-h[2]
u<-c(x.ab,y.ab,z.ab)
v<-c(x.cb,y.cb,z.cb)
dotuv<-u%*%v
modu<-sqrt(u%*%v)
modv<-sqrt(v%*%v)
mod<-modu%*%modv
ct<-dotuv/mod
ct%*%180
##          [,1]
## [1,] 127.2792

TORSION ANGLE (RAMACHANDRAN PLOT)

problem 2

s<-read.pdb("1b20")
##   Note: Accessing on-line PDB file
##    PDB has ALT records, taking A only, rm.alt=TRUE
summary(s)
## 
##  Call:  read.pdb(file = "1b20")
## 
##    Total Models#: 1
##      Total Atoms#: 3084,  XYZs#: 9252  Chains#: 3  (values: A B C)
## 
##      Protein Atoms#: 2595  (residues/Calpha atoms#: 326)
##      Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)
## 
##      Non-protein/nucleic Atoms#: 489  (residues: 489)
##      Non-protein/nucleic resid values: [ HOH (486), ZN (3) ]
## 
## + attr: atom, xyz, seqres, helix, sheet,
##         calpha, remark, call
t<-torsion.pdb(s)
phi<-t$phi
psi<-t$psi
om<-t$omega
r1<-c(phi[1:10],psi[1:10])
(r1)
##  [1]         NA  -84.15111  -92.28564 -145.01497  -97.56397  -58.21666
##  [7]  -69.95596  -63.55970  -71.80134  -60.26555   56.07003  125.44498
## [13]   99.79264   15.87531  166.25670  -49.03518  -39.92110  -45.61653
## [19]  -40.67541  -44.24673
plot(om,pch=19)

plot(psi,phi,pch=19)

##SCATTER PLOT ###problem 3

summary(s$atom)
##      type               eleno           elety               alt           
##  Length:3084        Min.   :   1.0   Length:3084        Length:3084       
##  Class :character   1st Qu.: 775.8   Class :character   Class :character  
##  Mode  :character   Median :1551.5   Mode  :character   Mode  :character  
##                     Mean   :1552.7                                        
##                     3rd Qu.:2330.2                                        
##                     Max.   :3105.0                                        
##     resid              chain               resno        insert         
##  Length:3084        Length:3084        Min.   :  2   Length:3084       
##  Class :character   Class :character   1st Qu.: 34   Class :character  
##  Mode  :character   Mode  :character   Median : 71   Mode  :character  
##                                        Mean   :107                     
##                                        3rd Qu.:100                     
##                                        Max.   :567                     
##        x                 y                z               o         
##  Min.   :-23.172   Min.   : 3.066   Min.   :16.48   Min.   :0.0000  
##  1st Qu.:  4.653   1st Qu.:22.232   1st Qu.:37.97   1st Qu.:1.0000  
##  Median : 21.022   Median :33.679   Median :47.63   Median :1.0000  
##  Mean   : 17.226   Mean   :32.855   Mean   :48.32   Mean   :0.9827  
##  3rd Qu.: 29.867   3rd Qu.:42.504   3rd Qu.:57.38   3rd Qu.:1.0000  
##  Max.   : 47.164   Max.   :63.576   Max.   :83.82   Max.   :1.0000  
##        b            segid              elesy              charge         
##  Min.   : 6.69   Length:3084        Length:3084        Length:3084       
##  1st Qu.:11.66   Class :character   Class :character   Class :character  
##  Median :14.09   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.91                                                           
##  3rd Qu.:20.00                                                           
##  Max.   :54.99
a<-s$atom[, c("x","y","z","elety")]
aa<-which(a$elety=="CA")
a.a<-s$atom[, c("x","y","z")]
r<-a.a[aa,]
summary(r)
##        x                 y                z        
##  Min.   :-19.196   Min.   : 8.307   Min.   :19.96  
##  1st Qu.:  3.939   1st Qu.:22.906   1st Qu.:38.28  
##  Median : 20.413   Median :33.884   Median :47.77  
##  Mean   : 16.839   Mean   :33.154   Mean   :48.77  
##  3rd Qu.: 29.245   3rd Qu.:42.310   3rd Qu.:57.91  
##  Max.   : 45.868   Max.   :59.271   Max.   :80.55
scatterplot3d(r,highlight.3d = TRUE, col.axis = "red",col.grid = "lightblue", main = "scatterplot 3D",pch = 20)

ik<-which(s$atom$elety=="CA")
CAxyz<-s$atom[ik,c("x","y","z")]
plot(CAxyz)

##PROTEIN BLAST ###problem 4

blast<-blast.pdb(pdbseq(s))
##  Searching ... please wait (updates every 5 seconds) RID = E8JYTV1E015 
##  
##  Reporting 156 hits
top.hits<-plot.blast(blast, cutoff = 25)
##   * Possible cutoff values:    133 -3 
##             Yielding Nhits:    103 156 
## 
##   * Chosen cutoff value of:    25 
##             Yielding Nhits:    103

head(top.hits$hits)
##   pdb.id   acc      group
## 1 "1B20_A" "1B20_A" "1"  
## 2 "1B20_A" "1B20_A" "1"  
## 3 "1B20_A" "1B20_A" "1"  
## 4 "1B21_A" "1B21_A" "1"  
## 5 "1B21_A" "1B21_A" "1"  
## 6 "1B21_A" "1B21_A" "1"
a<-blast$raw
returned.pdbids<-a$subjectids
(returned.pdbids)
##   [1] "1B20_A" "1B20_A" "1B20_A" "1B21_A" "1B21_A" "1B21_A" "1BRN_L"
##   [8] "1BRN_L" "1BRN_L" "1FW7_A" "1FW7_A" "1FW7_A" "1BSA_A" "1BSA_A"
##  [15] "1BSA_A" "1BSB_A" "1BSB_A" "1BSB_A" "1BSC_A" "1BSC_A" "1BSC_A"
##  [22] "1BSD_A" "1BSD_A" "1BSD_A" "1BAO_A" "1BAO_A" "1BAO_A" "1BAN_A"
##  [29] "1BAN_A" "1BAN_A" "1BSE_A" "1BSE_A" "1BSE_A" "1B2Z_A" "1B2Z_A"
##  [36] "1B2Z_A" "1BNS_A" "1BNS_A" "1BNS_A" "1BNE_A" "1BNE_A" "1BNE_A"
##  [43] "1B2S_A" "1B2S_A" "1B2S_A" "2ZA4_A" "2ZA4_A" "2ZA4_A" "1BRI_A"
##  [50] "1BRI_A" "1BRI_A" "1BRJ_A" "1BRJ_A" "1BRJ_A" "1BRK_A" "1BRK_A"
##  [57] "1BRK_A" "1BNF_A" "1BNF_A" "1BNF_A" "2C4B_A" "2C4B_A" "2C4B_A"
##  [64] "1BRH_A" "1BRH_A" "1BRH_A" "1X1Y_A" "1X1Y_A" "1X1Y_A" "2F4Y_A"
##  [71] "2F4Y_A" "2F4Y_A" "1B3S_A" "1B3S_A" "1B3S_A" "1RNB_A" "1RNB_A"
##  [78] "1RNB_A" "1BNG_A" "1BNG_A" "1BNG_A" "1BRG_A" "1BRG_A" "1BRG_A"
##  [85] "3DA7_A" "3DA7_A" "3DA7_A" "3DA7_A" "3Q3F_A" "3Q3F_A" "3Q3F_A"
##  [92] "1BUJ_A" "1BUJ_A" "1BUJ_A" "2RBI_A" "2RBI_A" "2RBI_A" "4HAA_A"
##  [99] "4HAA_A" "4HAA_A" "1GOY_A" "1GOY_A" "1GOY_A" "1MGR_A" "1MGR_A"
## [106] "1MGR_A" "4Q5N_A" "1T2I_A" "1T2I_A" "1T2I_A" "2I8L_A" "2E85_A"
## [113] "1PY3_A" "1PY3_A" "1PY3_A" "1UCK_A" "1UCK_A" "1UCK_A" "1UCL_A"
## [120] "1UCL_A" "1UCL_A" "4J5G_A" "4J5G_A" "4J5G_A" "1AY7_A" "1AY7_A"
## [127] "1AY7_A" "1UCI_A" "1UCI_A" "1UCI_A" "3A5E_A" "3A5E_A" "3A5E_A"
## [134] "4J5K_A" "4J5K_A" "4J5K_A" "1RGF_B" "1RGF_B" "1RGF_B" "4GHO_A"
## [141] "4GHO_A" "4GHO_A" "2QMI_A" "1SAR_A" "1I70_A" "1I8V_A" "1T2H_A"
## [148] "1UCJ_A" "1YNV_X" "1YNV_X" "1YNV_X" "4B6Z_A" "1BOX_A" "6AZI_A"
## [155] "6AZI_A" "6AZI_A"
p<-a$subjectids
p[10]
## [1] "1FW7_A"
p[40]
## [1] "1BNE_A"
p[100]
## [1] "4HAA_A"
plot(a$bitscore,a$evalue)

plot(a$evalue,type = "l")

plot(a$bitscore,type = "l")

#DATA QUIZ2 ##problem 5

y<-read.pdb("1ban")
##   Note: Accessing on-line PDB file
summary(y)
## 
##  Call:  read.pdb(file = "1ban")
## 
##    Total Models#: 1
##      Total Atoms#: 2825,  XYZs#: 8475  Chains#: 3  (values: A B C)
## 
##      Protein Atoms#: 2559  (residues/Calpha atoms#: 325)
##      Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)
## 
##      Non-protein/nucleic Atoms#: 266  (residues: 266)
##      Non-protein/nucleic resid values: [ HOH (266) ]
## 
## + attr: atom, xyz, seqres, helix, sheet,
##         calpha, remark, call
t1<-torsion.pdb(y)
phi1<-t1$phi
psi1<-t1$psi
om1<-t1$omega


z<-read.pdb("1box")
##   Note: Accessing on-line PDB file
##    PDB has ALT records, taking A only, rm.alt=TRUE
summary(z)
## 
##  Call:  read.pdb(file = "1box")
## 
##    Total Models#: 1
##      Total Atoms#: 823,  XYZs#: 2469  Chains#: 1  (values: A)
## 
##      Protein Atoms#: 728  (residues/Calpha atoms#: 95)
##      Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)
## 
##      Non-protein/nucleic Atoms#: 95  (residues: 95)
##      Non-protein/nucleic resid values: [ HOH (95) ]
## 
## + attr: atom, xyz, seqres, helix, sheet,
##         calpha, remark, call
t2<-torsion.pdb(z)
phi2<-t2$phi
psi2<-t2$psi

x<-read.pdb("2rbi")
##   Note: Accessing on-line PDB file
summary(x)
## 
##  Call:  read.pdb(file = "2rbi")
## 
##    Total Models#: 1
##      Total Atoms#: 1947,  XYZs#: 5841  Chains#: 2  (values: A B)
## 
##      Protein Atoms#: 1718  (residues/Calpha atoms#: 216)
##      Nucleic acid Atoms#: 0  (residues/phosphate atoms#: 0)
## 
##      Non-protein/nucleic Atoms#: 229  (residues: 229)
##      Non-protein/nucleic resid values: [ HOH (229) ]
## 
## + attr: atom, xyz, seqres, helix, sheet,
##         calpha, remark, call
t3<-torsion.pdb(x)
phi3<-t3$phi
psi3<-t3$psi



plot(psi,phi,pch=19,col="blue",cex=0.5,text(-100,150,"1.b=1ZNI",col="blue"))
points(psi2,phi2,pch=19,col="green",cex=0.4,text(-25,150,"2.g=1SJU",col="green"))
points(psi1,phi1,pch=19,col="red",cex=0.4,text(45,15,"3.r=2INS",col = "red"))
points(psi3,phi3,pch=19,col="pink",cex=0.4,text(120,150,"4.bla=5BQQ",col = "black"))

ASSIGNMENT 4

problem 1:

Given three points (A,B,C) write an R script to determine the cross product of the two vectors AB and BC

#setwd("E:/kanta")
z<-read.csv("xyz.csv")
x1<-z$x
y1<-z$y
z1<-z$z
AB<-c(x1[2]-x1[1],y1[2]-y1[1],z1[2]-z1[1])
BC<-c(x1[3]-x1[2],y1[3]-y1[2],z1[3]-z1[2])

crs<-function(u,v){
  c1<- u[2]*v[3]-v[2]*u[3]
  c2<- u[3]*v[1]-v[3]*u[1]
  c3<- u[1]*v[2]-v[1]*u[2]
  
  return(c(c1,c2,c3))
}
K<-crs(AB,BC)
(K)
## [1]  2.46  4.90 -0.54

problem 2

Suppose you have x and y variables with some NA elements .Write a program to find

(a) Mean of x

(b) Mean of y

(c) Plot of y against x
#setwd("E:/kanta")
w<-read.csv("book new.csv")
f<-w$x
f1<-is.na(f)
f2<-which(f1)
b<-f[-f2]
b
##  [1]  4  4 74  7  4  4 96  4  5  5  5 98 46  8  9  1  5  6  5
mean(b)
## [1] 20.52632
g<-w$y
g1<-is.na(g)
g2<-which(g1)
a<-g[-g2]
a
##  [1]   4   4   7  78   9 596  87  78  55   4   5  78   7   2  22  48   5
## [18]   7   7
mean(a)
## [1] 58.05263
plot(b,a)

problem 4

Suppose you have a spectrum with more than one peaks. Write a program to find out all the peaks

##PROBLEM-4
library("readxl")
#setwd("E:/kanta")
p4<-read_xlsx("rti.xlsx")
R<-p4$Ramean
W<-p4$wavelen
wv1<-(401-320)
wv1
## [1] 81
wv2<-wv1+(501-401)
wv2
## [1] 181
wv3<-wv2+(601-501)
wv3
## [1] 281
peak1=0
peak2=0
peak3=0
for(i in 1:wv1){
  peak1[i]<-R[i]
}
for(i in (wv1+1):wv2){
  peak2[i-wv1]<-R[i]
}
for(i in (wv2+1):wv3){
  peak3[i-wv2]<-R[i]
}
plot(W,R,cex=0.7,col="blue",main="PLOT R",xlab = "Wavelength",ylab = "Fluorescence intensity")

plot(peak1,cex=0.7,col="RED",main="PLOT R.1")

plot(peak2,cex=0.7,col="BLACK",main="PLOT R.2")

plot(peak3,cex=0.7,col="VIOLET",main="PLOT R.3")

problem 5

Write a program to generate a random sequence of amino acids.

all<-"ARNDCQEGHILKMFPSTWYV"
ss<-unlist(strsplit(all,""))
for(i in 1:20){cat(sample(ss))
  cat("\n")}
## L Q M R Y K A H E G W F D I C P S T V N
## H T W G K I S Q C N D L F A P M V R Y E
## R N D I P L C W Y H K E T M G S V A F Q
## Y V K A R P C Q D T I W G F N H L M S E
## I K W P R G N C S M T H V D Y L F Q A E
## E P F T S I W M C K A D Q L G N H V Y R
## Q E G F D N A K M L I T S V Y H C P R W
## Y D S E W M N L K A V H G R P Q T F C I
## G H N P S T E A K V D Y I W C F M R L Q
## V N D Y G M R S F P W Q I A H L K E C T
## K C M S L D F E R V Q N I P G Y H A T W
## A L W D Q K H E V G P S R C N I Y T M F
## N I P F S T M L V W Y H G D R E K Q C A
## P Y H V I A W C D K F M S G E L R N Q T
## Y F V A E P R Q K L N H S C I T G W M D
## M T V C N W F D K I L S A Q P H R G Y E
## Q G H P Y K R E A S F V M C W T I L N D
## Y F R T K P Q V C A G L D H N W E S M I
## W R N A E V I F K Q H M T Y L C G P D S
## H L Q S V E P I G W C F R D N M T Y A K

CHI-SQUARE TEST

#setwd("E:/kanta")
u<-read.csv("aspirincsv.csv")
a<-u$condition
#chi-square-analysis:
con<-u$condition
pla<-which(con=="placebo")
chg<-u$change
mn<-mean(chg)
h<-chg-mn
h[pla]
##  [1] -0.05333333  1.04666667  1.14666667  0.64666667  2.24666667
##  [6]  3.14666667  1.24666667  0.94666667  1.44666667  2.04666667
## [11]  1.64666667  1.24666667  0.74666667  1.04666667  2.04666667
## [16]  1.14666667
pm<-h[pla]
which(pm>mn)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16
pp<-which(pm>mn)
length(pp)
## [1] 16
h
##  [1] -0.05333333  1.04666667 -2.95333333  1.14666667 -2.15333333
##  [6] -0.75333333  0.04666667 -3.35333333  0.64666667  2.24666667
## [11] -0.25333333 -2.45333333 -3.95333333  3.14666667 -1.65333333
## [16]  1.24666667  0.94666667  1.44666667 -2.35333333  2.04666667
## [21] -1.35333333  1.64666667  1.24666667  0.74666667  1.04666667
## [26]  2.04666667  0.94666667 -0.15333333 -1.35333333  1.14666667
mean(h)
## [1] -1.258433e-16
k<-u$change
k
##  [1] -3.2 -2.1 -6.1 -2.0 -5.3 -3.9 -3.1 -6.5 -2.5 -0.9 -3.4 -5.6 -7.1  0.0
## [15] -4.8 -1.9 -2.2 -1.7 -5.5 -1.1 -4.5 -1.5 -1.9 -2.4 -2.1 -1.1 -2.2 -3.3
## [29] -4.5 -2.0
mean(k)
## [1] -3.146667
h
##  [1] -0.05333333  1.04666667 -2.95333333  1.14666667 -2.15333333
##  [6] -0.75333333  0.04666667 -3.35333333  0.64666667  2.24666667
## [11] -0.25333333 -2.45333333 -3.95333333  3.14666667 -1.65333333
## [16]  1.24666667  0.94666667  1.44666667 -2.35333333  2.04666667
## [21] -1.35333333  1.64666667  1.24666667  0.74666667  1.04666667
## [26]  2.04666667  0.94666667 -0.15333333 -1.35333333  1.14666667
which(k>mean(k))
##  [1]  2  4  7  9 10 14 16 17 18 20 22 23 24 25 26 27 30
length(which(k>mean(k)))
## [1] 17
which(k>mean(k)& u$condition=="placebo")
##  [1]  2  4  9 10 14 16 17 18 20 22 23 24 25 26 30
l<-which(k>mean(k) & u$condition=="placebo")

length(l)
## [1] 15
u$change<-((u$change)**2)**0.5
u$change
##  [1] 3.2 2.1 6.1 2.0 5.3 3.9 3.1 6.5 2.5 0.9 3.4 5.6 7.1 0.0 4.8 1.9 2.2
## [18] 1.7 5.5 1.1 4.5 1.5 1.9 2.4 2.1 1.1 2.2 3.3 4.5 2.0
m<-mean(u$change)  
m
## [1] 3.146667
ik<-which(u$change>m &u$condition=="placebo")
length(ik)
## [1] 1
jk<-which(u$change<m & u$condition=="placebo")
length(jk)
## [1] 15
kk<-which(u$change>m & u$condition=="aspirin")
length(kk)
## [1] 12
pk<-which(u$change<m & u$condition=="aspirin")
length(pk)
## [1] 2
placebo<-c(length(ik),length(jk))
placebo
## [1]  1 15
placebo<-cbind(length(ik),length(jk))
placebo
##      [,1] [,2]
## [1,]    1   15
colnames(placebo)<-c("+","-")
placebo
##      +  -
## [1,] 1 15
aspirin<-cbind(length(kk),length(pk))
data.file<-rbind(placebo,aspirin)
data.file
##       +  -
## [1,]  1 15
## [2,] 12  2
colnames(data.file)<-c("+","-")
data.file
##       +  -
## [1,]  1 15
## [2,] 12  2
rownames(data.file)<-c("placebo","aspirin")
colnames(data.file)<-c("+","-")
data.file
##          +  -
## placebo  1 15
## aspirin 12  2
#probability of value

pa.pos<-((data.file[2,2]+data.file[2,1])/30)*((data.file[1,1]+data.file[2,1])/30)
po.pos<-((data.file[1,1]+data.file[1,2])/30)*((data.file[1,1]+data.file[2,1])/30)
po.pos
## [1] 0.2311111
pi.pos<-((data.file[2,1]+data.file[2,2])/30)*((data.file[1,2]+data.file[2,2])/30)
pi.pos
## [1] 0.2644444
pp.pos<-((data.file[1,2]+data.file[1,1])/30)*((data.file[1,2]+data.file[2,2])/30)
pp.pos
## [1] 0.3022222
pap<-pa.pos*30
pop<-po.pos*30
pip<-pi.pos*30
ppp<-pp.pos*30
e11<-(data.file[1,1]-pop)^2/pop
e12<-(data.file[1,2]-ppp)^2/ppp
e21<-(data.file[2,1]-pip)^2/pip
e22<-(data.file[2,2]-pap)^2/pap
chivalue<-sum(e11,e12,e21,e22)
chivalue
## [1] 13.77101

ASSIGNMENT 6

CHI-SQUARE VALUE AND PROTEIN COMPARISM

library(bio3d)
a<-read.pdb("1wba")
##   Note: Accessing on-line PDB file
t.a<-torsion.pdb(a)
phi.a<-t.a$phi
b<-is.na(phi.a)
b1<-which(b)
phi.ak<-phi.a[-b1]
psi.a<-t.a$psi
e<-is.na(psi.a)
e1<-which(e)
psi.ak<-psi.a[-e1]
par(mfrow=c(2,2))
plot(phi.ak,psi.ak,xlim = c(-180,180),ylim = c(-180,180),main = "1wba")
abline(h=c(0,0),v=c(0,0))
phi1<- -50.0
phi2<- -100.0
ik<-which(phi.ak<phi1 & phi.ak>phi2)
phial.ak<-length(ik)
phi3<- -40.0
phi4<- -150.0
ij<-which(phi.ak<phi3 & phi.ak>phi4)
phib.ak<-length(ij)
psi5<- -80.0
psi6<- 10.0
il<-which(psi.ak<psi6 & psi.ak>psi5)
psial.ak<-length(il)
psi7<- 110.0
psi8<- 160.0
im<-which(psi.ak>psi7 & psi.ak<psi8)
psib.ak<-length(im)
r<-cbind(phial.ak,psial.ak,phib.ak,psib.ak)
rownames(r)<-c("1wba")
(r)
##      phial.ak psial.ak phib.ak psib.ak
## 1wba       94       28     154     104

u<-read.pdb("3ua0")
##   Note: Accessing on-line PDB file
t.ua<-torsion.pdb(u)
phi.ua<-t.ua$phi
x<-is.na(phi.ua)
x1<-which(x)
phi.ua<-phi.ua[-x1]
psi.ua<-t.ua$psi
s<-is.na(psi.ua)
s1<-which(s)
psi.ua<-psi.ua[-s1]
plot(phi.ua,psi.ua,xlim = c(-180,180),ylim = c(-180,180),main = "3ua0")
abline(h=c(0,0),v=c(0,0))

phi9<- -50.0
phi10<- -110.0
ik1<-which(phi.ua<phi9 & phi.ua>phi10)
phial.ua<-length(ik1)
phi11<- -50.0
phi12<- -150.0
ij1<-which(phi.ak<phi11 & phi.ak>phi12)
phib.ua<-length(ij1)
psi13<- -50.0
psi14<- 10.0
il1<-which(psi.ua<psi14 & psi.ua>psi13)
psial.ua<-length(il1)
psi15<- 100.0
psi16<- 160.0
im1<-which(psi.ua>psi15 & psi.ua<psi16)
psib.ua<-length(im1)
r1<-cbind(phial.ua,psial.ua,phib.ua,psib.ua)
rownames(r1)<-c("3ua0")
(r1)
##      phial.ua psial.ua phib.ua psib.ua
## 3ua0       60       15     154      90
main.pro<-rbind(r,r1)
main.pro
##      phial.ak psial.ak phib.ak psib.ak
## 1wba       94       28     154     104
## 3ua0       60       15     154      90
colnames(main.pro)<-c("Phi-alpha","Psi-alpha","Phi-beta","Psi-beta")
main.pro
##      Phi-alpha Psi-alpha Phi-beta Psi-beta
## 1wba        94        28      154      104
## 3ua0        60        15      154       90
chisq.test(main.pro)
## 
##  Pearson's Chi-squared test
## 
## data:  main.pro
## X-squared = 7.1784, df = 3, p-value = 0.06642

T-TEST PROBLEM

x<-c(0.593, 0.142, 0.329, 0.691, 0.231, 0.793, 0.519, 0.392, 0.418) 
t.test<-function(dataset){
  s<-sd(dataset)
  d.mean<-mean(dataset)
  n1<-length(dataset)
  n<-n1**0.5
  u<-0.3
  t<-(d.mean)/(s/n)
  return(t)
}
t.test<-t.test(x)
t.test
## [1] 6.43351
mean(x)
## [1] 0.4564444
#INTERPRETATION
##from the above ,it was found that the mean of the above data ,level of Salmonella (MPN/g) of randomly sampled batches of ice cream is 0.4564444.We have taken the mean value for the Null hypothesis as 0.3,ie the Salmonella level is equal to 0.3. As the mean value found here is greater than the null hyopthesis, it is not possible to accept the Null hypothesis,we have to accept the alternate hypothesis,ie the level of Salmonella is more than 0.3 MPN/g.

problem2

ballet<-c(89.2,78.2,89.3,88.3,87.3,90.1,95.2,94.3,78.3,89.3)
mean1<-mean(ballet)
sd.ball<-sd(ballet)
sd.ball
## [1] 5.690587
s1<-sd.ball^2
s1
## [1] 32.38278
l1<-length(ballet)
lb<-l1-1
lb
## [1] 9
FOOTBALL<-c(79.3,78.3,85.3,79.3,88.9,91.2,87.2,89.2,93.3,79.9)
mean2<-mean(FOOTBALL)
sd.foot<-sd(FOOTBALL)
sd.foot
## [1] 5.583995
s2<-sd.foot^2
s2
## [1] 31.181
l2<-length(FOOTBALL)
lf<-l2-1
lf
## [1] 9
L<-l1+l2-1
L
## [1] 19
sp<-(s1*lb + s2*lf)/L
sp
## [1] 30.10916
Sq<-(sp*(1/l1 + 1/l2))^0.5
Sq
## [1] 2.453942
MM<-mean1 - mean2
MM
## [1] 2.76
ttest2<-MM/Sq
ttest2
## [1] 1.124721
#Interpritation:
## the t table does not give you the precise probability of every t value,you can use it for hypothesis testing.according to the given reference index,our value is 1.124721 which is almost near to 1.372 haveing p value=0.10
### so at 95% confidence level,haveing value greater than 0.05 it indicate null hypothesis.
####This means that we can not distinguish between the fitness level between the ballet dancers and the football player.

problem3

data<-c(220, 200, 240, 210, 225 ,210, 180, 170 ,210 ,220 ,190 ,180 ,195 ,190 ,200, 190, 210, 220,240,210) 
M<-matrix(data,nrow=10,byrow=TRUE)
colnames(M)<-c("Before","After")
(M)
##       Before After
##  [1,]    220   200
##  [2,]    240   210
##  [3,]    225   210
##  [4,]    180   170
##  [5,]    210   220
##  [6,]    190   180
##  [7,]    195   190
##  [8,]    200   190
##  [9,]    210   220
## [10,]    240   210
t.test(M[,1],M[,2],alternative="two.sided",mu=0,paired=TRUE,var.equal=FALSE,conf.level=0.95)
## 
##  Paired t-test
## 
## data:  M[, 1] and M[, 2]
## t = 2.5017, df = 9, p-value = 0.03377
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   1.053366 20.946634
## sample estimates:
## mean of the differences 
##                      11

ASSIGNMENT 2

PROJECT 1

library("bio3d")
library("Peptides")
## Warning: package 'Peptides' was built under R version 3.4.4
#membrane protein
#setwd("E:/kanta")
mem<-read.pdb("4nyk.pdb")
seq.mem<-pdbseq(mem)
#Amino acid descriptors for the membrane protein compared with the reference 20 aa present

#Thermostability value of the membrane protein
seq.mem.i<-aIndex(seq.mem)
ik<-which(seq.mem.i==0.0)
seq.mem.in<-seq.mem.i[-ik]
therm.mem<-sum(seq.mem.in)/length(seq.mem.in)
(therm.mem)
## [1] 302.2523
#Protein Interaction
seq.mem.pp<-boman(seq.mem)
bin.pot<-sum(seq.mem.pp)/length(seq.mem)
bin.pot
## [1] 1.41686
#binpot value = 1.41686, which is <2.68 which implies the protein has lower affinity to bind with another protein
#Cytosolic protein
cyto.ibv<-read.pdb("3ibv.pdb")
seq.ibv<-pdbseq(cyto.ibv)
#Amino acid descriptors for the cytosolic protein compared with the reference 20 aa present

#Thermostability value of the cytosolic protein
seq.ibv.i<-aIndex(seq.ibv)
jk<-which(seq.ibv.i==0.0)
seq.ibv.in<-seq.ibv.i[-jk]
therm.ibv<-sum(seq.ibv.in)/length(seq.ibv.in)
(therm.ibv)
## [1] 315.2815
#Protein Interaction
seq.ibv.pp<-boman(seq.ibv)
bin.pot1<-sum(seq.ibv.pp)/length(seq.ibv)
(bin.pot1)
## [1] 1.08492
#binpot<2.68 which implies the protein has lower affinity to bind with another protein
#Principle components
#A function for protein pca
protanapca<-function(seq)
  {library("bio3d")
  library("Peptides")
  ch<-charge(seq, pH = 7, pKscale = "EMBOSS")
  pp<-boman(seq) 
  ampI<-hmoment(seq, angle = 100, window = 11)
  hy<-hydrophobicity(seq, scale = "KyteDoolittle")
  pI<-pI(seq,pKscale = "EMBOSS")
  mw<-mw(seq, monoisotopic = FALSE)
  protana<-cbind(ch,pp,ampI,hy,pI,mw)
  colnames(protana)<-c("CHARGE","P-P","AMPIPHILLICITY","HYDRPHOBICITY","ISOELECTRIC","MOL WEIGHT")
  p<-cbind(seq,protana)
  colnames(p)<-c("AA","CHARGE","P-P","AMPIPHILLICITY","HYDRPHOBICITY","ISOELECTRIC","MOL WEIGHT")
  pr.p<-prcomp(protana)
  return(pr.p)}
#A function for protein analysis of different properties
protana<-function(seq){
  library("bio3d")
  library("Peptides")
  ch<-charge(seq, pH = 7, pKscale = "EMBOSS")
  pp<-boman(seq) 
  ampI<-hmoment(seq, angle = 100, window = 11)
  hy<-hydrophobicity(seq, scale = "KyteDoolittle")
  pI<-pI(seq,pKscale = "EMBOSS")
  mw<-mw(seq, monoisotopic = FALSE)
  protana<-cbind(ch,pp,ampI,hy,pI,mw)
  colnames(protana)<-c("CHARGE","P-P","AMPIPHILLICITY","HYDRPHOBICITY","ISOELECTRIC","MOL WEIGHT")
  p<-cbind(seq,protana)
  colnames(p)<-c("AA","CHARGE","P-P","AMPIPHILLICITY","HYDRPHOBICITY","ISOELECTRIC","MOL WEIGHT")
  return(p)}


#Membrane protein
mem<-read.pdb("4nyk.pdb")
##    PDB has ALT records, taking A only, rm.alt=TRUE
seq.mem<-pdbseq(mem)
mem.a<-protana(seq.mem)
pca.mem<-protanapca(seq.mem)
par(mfrow=c(1,1))
biplot(pca.mem,main="4NYK")

#Cytosolic protein
cyto.ibv<-read.pdb("3ibv.pdb")
seq.ibv<-pdbseq(cyto.ibv)
ibv.a<-protana(seq.ibv)
pca.ibv<-protanapca(seq.ibv)
pca.ibv
## Standard deviations (1, .., p=6):
## [1] 25.34836984  5.78031951  1.44318820  0.99829390  0.31081320  0.05188966
## 
## Rotation (n x k) = (6 x 6):
##                         PC1          PC2          PC3          PC4
## CHARGE         -0.001328563  0.006486049 -0.298910322 -0.084334098
## P-P            -0.044389433 -0.862420827 -0.048271204  0.484420200
## AMPIPHILLICITY -0.008684262  0.008019359 -0.175336332  0.247985461
## HYDRPHOBICITY   0.028413760  0.502999758 -0.110877881  0.822395087
## ISOELECTRIC    -0.009296199 -0.018256433 -0.930158122 -0.142804185
## MOL WEIGHT     -0.998528235  0.052743568  0.009573086  0.001151903
##                         PC5           PC6
## CHARGE          0.142428225  0.9397929272
## P-P             0.130632948  0.0142087580
## AMPIPHILLICITY -0.946338281  0.1098385800
## HYDRPHOBICITY   0.239905690 -0.0012563071
## ISOELECTRIC     0.097323488 -0.3232974705
## MOL WEIGHT      0.008154187  0.0001367825
biplot(pca.ibv,main="3IBV")

#CONCLUSION
#The membrane and cytosolic protein have nearly equal thermal stability
#The protein protein interaction parameter concludes that the membrane protein has higher binding potential for a protein of equal thermal stability
#The pca shows out of the six parameters chosen for analysis of this protein molecular weight is an outlier and has minimum relation to protein functionality whereas protein-protein interaction and hydrophobicity are inversely related 
#The other four factors such as charge,pI,Ampiphilicity and polarity are related factors for a protein irrespective of its origin
#Hence the a conclusive outcome for the above pca could be that for proteins of different origins is that hydrophobicity and protein protein interaction are inversely related irrespective of its origin
#The stastical tool used are pca,biplot and self made function protanapca(which gives the pca values of any protein sequence) and protana(which gives a matrix of all amino acid vs six parameters for any given protein)

project2

A webserver based bio3d application to explore molecular dynamics

Legand NMA FLUCTUATION

Legand NMA FLUCTUATION

Legand protein structure

Legand protein structure

PROJECT 3

REAL DATA ANALYSIS

#setwd("E:/kanta")
ali<-read.csv("Alipurduarr.csv")
ban<-read.csv("Bankura.csv")
bir<-read.csv("Birbhummm.csv")
bur<-read.csv("Burdwann.csv")
mal<-read.csv("Malda.csv")
coo<-read.csv("Coochbeharr.csv")
d.dina<-read.csv("D Dinajpur.csv")
e.med<-read.csv("E Medinipurr.csv")
how<-read.csv("Howrahh.csv")
jal<-read.csv("Jalpaiguri.csv")
kol<-read.csv("Kolkataa.csv")
mur<-read.csv("Murshidabadd.csv")
npgs<-read.csv("N 24 Pgs.csv")


alipur<-cbind(ali$AGE,ali$HB,ali$MCV,ali$MCH,ali$HBA0,ali$HBA2,ali$HBF)
distn<-rep("Alipurduarr",nrow(ali))
sex1<-as.character(ali$SEX)
alipura1<-cbind(alipur,distn,sex1)
summary(alipura1)
##        V1            V2            V3            V4            V5     
##  16     :143   11.3   : 43   72.3   : 13   22.9   : 26   218    : 29  
##  13     :134   11.6   : 40   70.5   : 12   21.7   : 22   228    : 27  
##  14     :120   11     : 39   71.3   : 12   21.8   : 20   225    : 26  
##  15     :115   10.7   : 35   71.8   : 11   20.2   : 19   226    : 26  
##  17     :115   10.8   : 35   72.9   : 11   20.9   : 19   217    : 25  
##  12     :111   12     : 35   (Other):993   22.7   : 19   221    : 24  
##  (Other):315   (Other):826   NA's   :  1   (Other):928   (Other):896  
##        V6            V7              distn      sex1   
##  0      :143   0      :272   Alipurduarr:1053    :  1  
##  2.8    :102   0.2    :193                      f:  9  
##  2.6    : 98   0.3    :149                      F:589  
##  2.7    : 97   0.4    : 78                      m:  2  
##  3      : 91   0.1    : 50                      M:452  
##  (Other):519   (Other):310                             
##  NA's   :  3   NA's   :  1
bankura<-cbind(ban$AGE,ban$HB,ban$HBF,ban$HBA2,ban$HBA0,ban$MCV,ban$MCH)
distn1<-rep("Bankura",nrow(ban))
sex<-as.character(ban$SEX)
bankura1<-cbind(bankura,distn1,sex)
summary(bankura1)
##        V1            V2            V3            V4            V5     
##  13     :138   12.1   : 53   0.2    :349   2.8    :174   88.3   : 61  
##  12     :132   11.2   : 52   0.3    :266   2.9    :158   87.8   : 52  
##  10     :101   12     : 52   0.4    :120   2.7    :149   88     : 52  
##  8      : 91   11.9   : 50   0      :106   2.6    :128   87.9   : 50  
##  15     : 86   11.3   : 48   0.1    : 90   3      :125   88.1   : 50  
##  (Other):602   11.7   : 47   0.5    : 78   3.1    : 88   87.5   : 48  
##  NA's   :  1   (Other):849   (Other):142   (Other):329   (Other):838  
##        V6             V7          distn1     sex    
##  72.7   :  17   21.6   : 31   Bankura:1151    :  1  
##  71.8   :  14   22.4   : 30                  F:458  
##  70.7   :  13   22.3   : 27                  M:692  
##  72.5   :  13   21.7   : 26                         
##  70.6   :  11   22     : 26                         
##  71.9   :  11   22.1   : 26                         
##  (Other):1072   (Other):985
birbhum<-cbind(bir$AGE,bir$HB,bir$HBF,bir$HBA0,bir$HBA2,bir$MCH,bir$MCV)
distn2<-rep("Birbhummm",nrow(bir))
sex3<-as.character(bir$SEX)
birbhuma1<-cbind(birbhum,distn2,sex3)
summary(birbhuma1)
##        V1             V2             V3             V4      
##  1      :1121   11.4   :  60   0.2    : 373   87.9   :  61  
##  5      : 207   11.5   :  52   0.3    : 237   87.5   :  54  
##  4      : 203   11.6   :  38   0.1    : 139   87.6   :  54  
##  6      : 137   11.8   :  36   0.4    :  77   87.8   :  52  
##  7      :  96   11.9   :  36   0.5    :  54   87     :  48  
##  3      :  82   (Other): 728   (Other):  70   (Other): 681  
##  (Other): 225   NA's   :1121   NA's   :1121   NA's   :1121  
##        V5             V6             V7             distn2     sex3     
##  2.9    : 140   20.5   :  29   69.7   :  22   Birbhummm:2071     :1124  
##  3      : 128   20.8   :  28   70.7   :  17                    F : 326  
##  3.2    : 121   21     :  28   77.6   :  16                    M : 618  
##  3.1    : 113   21.2   :  28   68.5   :  15                    M`:   3  
##  2.8    : 106   19.9   :  27   69.2   :  15                             
##  (Other): 342   (Other): 810   (Other): 865                             
##  NA's   :1121   NA's   :1121   NA's   :1121
burdwan<-cbind(bur$AGE,bur$HB,bur$HBF,bur$HBA0,bur$HBA2,bur$MCV,bur$MCH)
distn3<-rep("Burdann",nrow(bur))
sex4<-as.character(bur$SEX)
burdwana1<-cbind(burdwan,distn3,sex4)
summary(burdwana1)
##        V1            V2             V3            V4             V5     
##  5      :352   11     : 100   0.2    :956   88     : 136   2.6    :358  
##  4      :351   11.4   :  89   0.3    :430   87.8   : 125   2.7    :352  
##  6      :327   11.7   :  77   0.1    :424   87.9   : 113   2.8    :311  
##  3      :324   11.2   :  76   0.4    :215   87.7   : 112   2.5    :288  
##  2      :258   11.1   :  73   0.5    : 92   87.6   : 105   2.9    :209  
##  1      :188   11.5   :  72   (Other):139   (Other):1666   (Other):734  
##  (Other):459   (Other):1772   NA's   :  3   NA's   :   2   NA's   :  7  
##        V6             V7           distn3         sex4     
##  72.3   :  28   22     :  60   Burdann:2259   F     :  22  
##  72.5   :  21   22.1   :  48                  Female: 499  
##  71     :  20   21.5   :  45                  M     : 188  
##  71.2   :  19   21.4   :  44                  Male  :1549  
##  73.2   :  19   22.4   :  44                  Sex   :   1  
##  76     :  19   20.7   :  40                               
##  (Other):2133   (Other):1978
cooch<-cbind(coo$AGE,coo$HB,coo$HBF,coo$HBA0,coo$HBA2,coo$MCV,coo$MCH)
distn4<-rep("Coochbeharr",nrow(coo))
sex5<-as.character(coo$SEX)
coocha1<-cbind(cooch,distn4,sex5)
summary(coocha1)
##   V1          V2           V3           V4           V5          V6    
##  11:3   10.8   : 2   0      :13   84.5   : 3   3.4    :5   72.4   : 3  
##  12:3   11.2   : 2   0.2    : 3   84.6   : 3   3.2    :4   69.1   : 2  
##  13:9   11.5   : 2   0.4    : 3   83.4   : 2   3.3    :3   73.1   : 2  
##  14:7   11.6   : 2   0.3    : 2   84.2   : 2   3.5    :3   61.7   : 1  
##  15:3   12.6   : 2   0.1    : 1   25.4   : 1   3.6    :3   64.7   : 1  
##  16:2   12.9   : 2   0.6    : 1   5      : 1   3      :2   66.8   : 1  
##         (Other):15   (Other): 4   (Other):15   (Other):7   (Other):17  
##        V7             distn4   sex5  
##  20.2   : 3   Coochbeharr:27   M:27  
##  22.3   : 3                          
##  20.9   : 2                          
##  23.4   : 2                          
##  19     : 1                          
##  19.7   : 1                          
##  (Other):15
dinaj<-cbind(d.dina$AGE,d.dina$HBF,d.dina$HB,d.dina$HBA0,d.dina$HBA2,d.dina$MCV,d.dina$MCH)
distn5<-rep("D Dinajpur",nrow(d.dina))
sex6<-as.character(d.dina$SEX)
dinaja1<-cbind(dinaj,distn5,sex6)
summary(dinaja1)
##        V1            V2            V3             V4             V5     
##  13     :408   0.2    :772   10.2   : 183   88.1   : 117   2.9    :333  
##  14     :368   0.3    :558   10.8   : 171   87.8   : 114   2.8    :330  
##  12     :357   0.4    :223   11.2   : 154   88     : 109   2.7    :279  
##  15     :282   0.1    :194   11.6   : 152   88.3   : 103   3      :275  
##  11     :206   0.5    :119   11.9   : 129   87.7   : 102   2.6    :177  
##  16     :173   0.6    : 72   10.6   : 123   88.2   : 101   (Other):695  
##  (Other):296   (Other):152   (Other):1178   (Other):1444   NA's   :  1  
##        V6             V7              distn5     sex6    
##  80.2   :  85   21.9   : 137   D Dinajpur:2090   F: 694  
##  81.2   :  73   20.8   : 125                     M:1396  
##  78.2   :  66   21.3   :  94                             
##  78.6   :  63   22.8   :  94                             
##  76.2   :  53   21.5   :  84                             
##  69.8   :  45   22.4   :  84                             
##  (Other):1705   (Other):1472
e.medini<-cbind(e.med$AGE,e.med$HB,e.med$HBF,e.med$HBA0,e.med$HBA2,e.med$MCV,e.med$MCH)
distn6<-rep("E medinipurr",nrow(e.med))
sex7<-as.character(e.med$SEX)
emedini1<-cbind(e.medini,distn6,sex7)
summary(emedini1)
##        V1            V2            V3            V4            V5     
##  12     :153   12.8   : 32   0.5    :204   82.7   : 37   2.6    : 89  
##  13     :135   12.7   : 29   0.6    :134   82.2   : 35   2.7    : 88  
##  14     :116   12.6   : 28   0.4    :117   82.4   : 30   2.5    : 79  
##  15     : 82   11.4   : 24   0.2    : 82   82.3   : 29   2.8    : 74  
##  11     : 80   12.9   : 24   0.3    : 73   82.5   : 29   2.4    : 62  
##  16     : 66   11.9   : 23   0.7    : 47   81.9   : 28   (Other):357  
##  (Other):120   (Other):592   (Other): 95   (Other):564   NA's   :  3  
##        V6            V7               distn6        sex7    
##  65     : 13   22.1   : 24   E medinipurr:752   Female:123  
##  65.5   : 12   22.6   : 22                      Male  :629  
##  70.6   : 11   21.8   : 19                                  
##  73.8   : 11   22.8   : 19                                  
##  63.7   : 10   22     : 18                                  
##  63.2   :  9   22.2   : 18                                  
##  (Other):686   (Other):632
howrah<-cbind(how$AGE,how$HB,how$MCV,how$MCH,how$HBA0,how$HBA2,how$HBF)
distn8<-rep("Howarhh",nrow(how))
sex8<-as.character(how$SEX)
howrah1<-cbind(howrah,distn8,sex8)
summary(howrah1)
##   V1           V2           V3           V4           V5           V6   
##  11:10   12.4   : 4   71.2   : 2   26.4   : 3   87.2   : 4   2.6    :8  
##  12: 9   11.7   : 3   73.2   : 2   21.2   : 2   88     : 4   2.7    :7  
##  13: 4   13.4   : 3   88.1   : 2   21.3   : 2   87     : 3   2.9    :7  
##  14:12   10.6   : 2   62.9   : 1   22.1   : 2   87.4   : 3   3.1    :7  
##  15: 6   11     : 2   67.9   : 1   22.5   : 2   87.6   : 3   2.8    :4  
##  16: 2   11.2   : 2   68.9   : 1   22.6   : 2   88.2   : 3   2.5    :3  
##  17: 2   (Other):29   (Other):36   (Other):32   (Other):25   (Other):9  
##    V7         distn8   sex8  
##  0.1: 6   Howarhh:45   F: 8  
##  0.2:16                M:37  
##  0.3:15                      
##  0.4: 2                      
##  0.5: 2                      
##  0.6: 3                      
##  0.8: 1
kolkata<-cbind(kol$AGE,kol$HB,kol$MCV,kol$MCH,kol$HBA0,kol$HBA2,kol$HBF)
distn10<-rep("kolkataa",nrow(kol))
sex10<-as.character(kol$SEX)
kolkata1<-cbind(kolkata,distn10,sex10)
summary(kolkata1)
##        V1           V2           V3           V4           V5    
##  20     :24   13.3   : 4   85.3   : 4   22.5   : 3   88.8   :10  
##  21     :18   13.5   : 4   71.1   : 2   26.4   : 3   88.2   : 8  
##  22     :17   14.6   : 4   71.3   : 2   26.7   : 3   88.9   : 8  
##  19     :12   14.7   : 4   71.5   : 2   27.1   : 3   88.5   : 7  
##  18     :10   10.9   : 3   76     : 2   27.6   : 3   88.6   : 7  
##  23     : 7   11.8   : 3   79.1   : 2   28.6   : 3   89     : 5  
##  (Other): 8   (Other):74   (Other):82   (Other):78   (Other):51  
##        V6           V7         distn10   sex10 
##  2.6    :17   0.1    :42   kolkataa:96   F:21  
##  2.4    :13   0.2    :30                 M:75  
##  2.5    :12   0.3    :11                       
##  2.7    :12   0.4    : 6                       
##  2.3    :10   0.5    : 2                       
##  2.9    : 7   0.6    : 1                       
##  (Other):25   (Other): 4
malda<-cbind(mal$AGE,mal$HB,mal$MCV,mal$MCH,mal$HBA0,mal$HBA2,mal$HBF)
distn11<-rep("Malada",nrow(mal))
sex11<-as.character(mal$SEX)
malda1<-cbind(malda,distn11,sex11)
summary(malda1)
##        V1            V2            V3            V4            V5     
##  14     :156   11.7   : 71   81.6   : 50   25.1   : 63   86.9   : 36  
##  12     :154   11     : 58   81.4   : 47   24.9   : 57   87.2   : 32  
##  13     :146   11.9   : 58   80     : 44   25.6   : 54   87.4   : 25  
##  15     :130   10.4   : 51   81.7   : 41   25.3   : 51   87.5   : 25  
##  11     : 96   12     : 50   81     : 35   24.6   : 45   87.1   : 24  
##  (Other):450   10.7   : 49   81.9   : 33   24.7   : 42   81.7   : 23  
##  NA's   :  1   (Other):796   (Other):883   (Other):821   (Other):968  
##        V6            V7        distn11        sex11    
##  2.7    :145   0.3    :336   Malada:1133   F     :224  
##  2.6    :144   0.4    :231                 Female:236  
##  2.5    :143   0.2    :214                 M     :455  
##  2.9    :102   0.5    :105                 Male  :218  
##  2.8    :101   0.6    : 68                             
##  2.4    : 83   (Other):178                             
##  (Other):415   NA's   :  1
murshi<-cbind(mur$AGE,mur$HB,mur$MCV,mur$MCH,mur$HBA0,mur$HBA2,mur$HBF)
distn12<-rep("Murshidabadd",nrow(mur))
sex12<-as.character(mur$SEX)
murshi1<-cbind(murshi,distn12,sex12)
summary(murshi1)
##        V1            V2            V3            V4            V5     
##  13     : 49   11     : 15   80     : 10   24     : 11   86.8   : 15  
##  14     : 44   10.2   : 14   82.9   :  7   25.3   : 11   87     : 13  
##  15     : 44   10.7   : 12   81.6   :  6   25.9   :  8   87.1   : 13  
##  12     : 40   13.9   : 12   81.7   :  6   21.1   :  7   87.3   : 13  
##  16     : 39   10     : 11   82.5   :  6   25.1   :  7   87.5   : 13  
##  11     : 36   11.1   : 10   90.4   :  5   25.4   :  7   87.8   : 12  
##  (Other):108   (Other):286   (Other):320   (Other):309   (Other):281  
##        V6            V7              distn12    sex12  
##  2.7    : 49   0.3    :100   Murshidabadd:360   F: 81  
##  3      : 43   0.2    : 97                      M:279  
##  2.9    : 40   0.4    : 54                             
##  2.6    : 38   0.5    : 27                             
##  2.8    : 35   0.9    : 14                             
##  3.1    : 32   (Other): 67                             
##  (Other):123   NA's   :  1
Npgs<-cbind(npgs$AGE,npgs$HB,npgs$MCV,npgs$MCH,npgs$HBA0,npgs$HBA2,npgs$HBF)
distn13<-rep("N 24 Pgs",nrow(npgs))
sex13<-as.character(npgs$SEX)
npgs1<-cbind(Npgs,distn13,sex13)
summary(npgs1)
##        V1           V2            V3            V4            V5     
##  13     :38   11.9   : 14   67.2   :  5   21.3   : 10   87.7   : 17  
##  11     :29   11.1   : 13   69.1   :  4   21.2   :  8   87.5   : 15  
##  12     :28   11.6   : 13   69.2   :  4   21.5   :  7   87.3   : 13  
##  10     :23   11.7   : 12   69.5   :  4   21.7   :  7   87.8   : 13  
##  8      :21   11     : 10   70.5   :  4   22.2   :  7   87.6   : 12  
##  9      :21   11.8   :  9   76.7   :  4   21.4   :  6   87.4   : 11  
##  (Other):62   (Other):151   (Other):197   (Other):177   (Other):141  
##        V6           V7         distn13       sex13    
##  2.7    :51   0.2    :82   N 24 Pgs:222   F     : 25  
##  2.8    :39   0.3    :41                  Female:  3  
##  2.6    :28   0.1    :38                  M     :194  
##  2.5    :21   0.4    :19                              
##  2.9    :20   0.5    :17                              
##  3      :16   0.6    : 9                              
##  (Other):47   (Other):16
master<-rbind(alipura1,bankura1,birbhuma1,burdwana1,coocha1,dinaja1,emedini1,howrah1,kolkata1,malda1,murshi1,npgs1)
master<-data.frame(master)
master.f<-na.omit(master)
master1<-rbind(alipur,burdwan,bankura,birbhum,cooch,dinaj,e.medini,howrah,kolkata,malda,murshi,Npgs)
colnames(master1)<-c("age","hb","hbf","hba2","hba0","mcv","mch")
master1.f<-na.omit(master1)
pca<-prcomp(master1.f,scale= TRUE)
biplot(pca)

g_class<- master.f$sex
library("ggbiplot")
g<-ggbiplot(pca,obs.scale = 0.3,var.scale = 0.3,groups = g_class,ellipse = TRUE,circle = TRUE,pc.biplot = TRUE)
g<-g+theme(legend.direction = 'horizontal',legend.position = 'top')
print(g)

library("ggbiplot")
g<-ggbiplot(pca,obs.scale = 0.3,var.scale = 0.3,groups = g_class,ellipse = TRUE,circle = TRUE,pc.biplot = TRUE)
g<-g+xlim(-2.5,2.5)
g<-g+ylim(-2.5,2.5)
g<-g+theme(legend.direction = 'horizontal',legend.position = 'top')
print(g)
## Warning: Removed 24 rows containing missing values (geom_point).

g_class1<-master.f$distn
library("ggbiplot")
g<-ggbiplot(pca,obs.scale = 0.3,var.scale = 0.3,groups = g_class1,ellipse = TRUE,circle = TRUE,pc.biplot = TRUE)
g<-g+xlim(-2.5,2.5)
g<-g+ylim(-2.5,2.5)
g<-g+theme(legend.direction = 'horizontal',legend.position = 'top')
print(g)
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_path).

CONCLUSION

  • The biplot shows that mcv, mch and hb are very much related to each other.Hence they are contribiting factors.
  • Age has very less co-relation to all other components and does not directly affects any component to a large extent.
  • Hbf is likely to be a independent component of the analysis.
  • From the ggbiplot which seperates components on the distinction of male and female patients the components seem to be likely be the same as
  • the biplot without much variance except the fact that the female popuplation have a high correlation of mcv to mch than male.
  • Also the age becomes more important in case of females with aspect of all the components correlation.
  • The most distinct factor here are the outliers that are found which may be either or consisderd as very special scenarios #having very rare conditions for both the male and female conditions.
  • Further this project could be extended to focus on finding this outliers or rare clinically important conditions because they are purely independent of each other.
  • For the time being this project concludes with the fact that mcv,mch and hb are dependent variables and hbf is the most independent variable component.
  • Also age and sex(M/F) are moderately significant for the other components except females have a tendency to have more mcv and mch with increase in age as suggested from the ggbiplot.