#1) Simulating a genome:

# The four DNA bases used as the sampling alphabet.
x <- c("a", "g", "c", "t")
# Simulated genome: 20 sequences (rows) of 50 bases (columns), every base drawn
# with replacement from x. Filling by row keeps one sequence per row.
y <- matrix(sample(x, 20 * 50, replace = TRUE), nrow = 20, byrow = TRUE)
# Show the 14th simulated sequence.
y[14, ]
##  [1] "t" "g" "c" "c" "a" "t" "t" "g" "t" "g" "c" "t" "a" "a" "c" "t" "a" "g" "a"
## [20] "g" "a" "t" "a" "a" "t" "a" "a" "c" "c" "g" "t" "g" "g" "g" "a" "g" "t" "a"
## [39] "g" "a" "a" "a" "t" "t" "a" "t" "g" "c" "c" "t"

#2)

# Text whose characters are counted below.
honor <- "Aggies do not lie cheat or steal nor tolerate those that do"
honor
## [1] "Aggies do not lie cheat or steal nor tolerate those that do"
# Break the honor code into single characters. strsplit() returns a list with
# one character vector per input string, so hchar is a list of length one.
hchar <- strsplit(honor, split = "")
hchar
## [[1]]
##  [1] "A" "g" "g" "i" "e" "s" " " "d" "o" " " "n" "o" "t" " " "l" "i" "e" " " "c"
## [20] "h" "e" "a" "t" " " "o" "r" " " "s" "t" "e" "a" "l" " " "n" "o" "r" " " "t"
## [39] "o" "l" "e" "r" "a" "t" "e" " " "t" "h" "o" "s" "e" " " "t" "h" "a" "t" " "
## [58] "d" "o"
# Tally how often each character occurs (spaces are counted; "A" and "a" are
# separate entries).
table(hchar = hchar[[1]])
## hchar
##     a  A  c  d  e  g  h  i  l  n  o  r  s  t 
## 11  4  1  1  2  7  2  3  2  3  2  7  3  3  8

#3a) Looking at the human gene with HGNC symbol SPRR4.

# Connection to the Ensembl BioMart human gene dataset; reused by the queries
# that follow.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Fetch the cDNA sequence for the gene whose HGNC symbol is SPRR4.
sprr4_cdna <- getSequence(
  id = "SPRR4",
  type = "hgnc_symbol",
  seqType = "cdna",
  mart = mart
)
#cDNA sequence:
sprr4_cdna
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              cdna
## 1 GGACAAGGGCTCAGCCTCCTCTCCTGGGGTCCAGCTTGTCGCCTCTGGCTCACCTGTTCCTAGAGCAATGTCTTCCCAGCAGCAGCAGCGGCAGCAGCAGCAGTGCCCACCCCAGAGGGCCCAGCAGCAGCAAGTGAAGCAGCCTTGTCAGCCACCCCCTGTTAAATGTCAAGAGACATGTGCACCCAAAACCAAGGATCCATGTGCTCCCCAGGTCAAGAAGCAATGCCCACCGAAAGGCACCATCATTCCAGCCCAGCAGAAGTGTCCCTCAGCCCAGCAAGCCTCCAAGAGCAAACAGAAGTAAGGATGGACTGGATATTACCATCATCCACCATCCTGGCTACCAGATGGAACCTTCTCTTCTTCCTTCTCCTCTTCCCTCCAGCTCTTGAGCCTACCCTCCTCTCACATCTCCTCCTGCCCAAGATGTAAGGAAGCATTGTAAGGATTTCTTCCCATCGTACCCTTCCCCACACATACCACCTTGGCTTCTTCTATATCCCACCCCGATGCTCTCCCAGGTGGGTGTGAGAGAGACCTCATTCTCTGCAGGCTCCAGCGTGGCCACAGCTAAGGCCCATCCATTTCCCAAAGTGAGGAAAGTGTCTGGGCTTCTTCTGGGGTTCCACCCTGACAAGTAGGGTCACAGAGGCTGGTGCACAGTTTCTGCCTCATTCCTCTCCATGATGCCCCCTGCTCTGGGCTTCTCTCCTGTTTTCCCCAATAAATATGTGCCTCATGTAATAAA
##   hgnc_symbol
## 1       SPRR4
# Same query as for the cDNA, but requesting the peptide sequence instead.
sprr4_pep <- getSequence(
  id = "SPRR4",
  type = "hgnc_symbol",
  seqType = "peptide",
  mart = mart
)
#peptide sequence:
sprr4_pep
##                                                                            peptide
## 1 MSSQQQQRQQQQCPPQRAQQQQVKQPCQPPPVKCQETCAPKTKDPCAPQVKKQCPPKGTIIPAQQKCPSAQQASKSKQK*
##   hgnc_symbol
## 1       SPRR4
#3b) Entrez ID:
# Ask BioMart for the HGNC symbol together with its Entrez gene ID, filtering
# on the symbol SPRR4.
sprr4_id <- getBM(
  attributes = c("hgnc_symbol", "entrezgene_id"),
  filters = "hgnc_symbol",
  values = "SPRR4",
  mart = mart
)
sprr4_id
##   hgnc_symbol entrezgene_id
## 1       SPRR4        163778
#3c) GO Information:
# Look up the GO annotations stored in org.Hs.egGO under the Entrez ID 163778
# (the ID reported for SPRR4 in 3b).
sprr4_go <- mget("163778", envir = org.Hs.egGO)
sprr4_go
## $`163778`
## $`163778`$`GO:0031424`
## $`163778`$`GO:0031424`$GOID
## [1] "GO:0031424"
## 
## $`163778`$`GO:0031424`$Evidence
## [1] "IEA"
## 
## $`163778`$`GO:0031424`$Ontology
## [1] "BP"
## 
## 
## $`163778`$`GO:0005938`
## $`163778`$`GO:0005938`$GOID
## [1] "GO:0005938"
## 
## $`163778`$`GO:0005938`$Evidence
## [1] "IEA"
## 
## $`163778`$`GO:0005938`$Ontology
## [1] "CC"

SPRR4 is known to be involved in the biological process of keratinization (GO:0031424), the process in which epidermal cells fill with keratin to form a harder surface such as nails, hooves, claws, or horns. The SPRR4 protein is known to be located in the cell cortex (GO:0005938).

#4a) The BRCA1 gene is relevant to breast cancer because it is a gene that helps to suppress cell growth. If the BRCA1 gene is mutated, then there is a higher risk of unregulated cell growth which leads to cancerous growth. This can occur in several areas of the body including the breasts.

#4b)

# Ask BioMart for the Entrez gene ID that belongs to the HGNC symbol BRCA1.
brca1_id <- getBM(
  attributes = c("hgnc_symbol", "entrezgene_id"),
  filters = "hgnc_symbol",
  values = "BRCA1",
  mart = mart
)
brca1_id
##   hgnc_symbol entrezgene_id
## 1       BRCA1           672
# Probe-to-Entrez-ID map for the hgu133a array annotation.
affy_entrez <- hgu133aENTREZID
# Probes that actually have an Entrez ID mapped to them.
mapped_probes <- mappedkeys(affy_entrez)
n <- length(mapped_probes)
# Convert the mapping for every mapped probe to a data frame. The original
# sliced with mapped_probes[1:n], which is the full vector when n >= 1 but
# indexes c(1, 0) when n is 0; passing the vector itself avoids that.
affy <- as.data.frame(affy_entrez[mapped_probes])
# Rows whose second column (the Entrez gene ID) equals 672, i.e. BRCA1.
affy[which(affy[[2]] == "672"), ]
##          probe_id gene_id
## 3966  204531_s_at     672
## 10709 211851_x_at     672

The probesets 204531_s_at and 211851_x_at correspond to the Entrez ID 672, which is BRCA1.

#4c) Pathways that BRCA1 is involved in
# Retrieve the KEGG record for the human gene entry hsa:672 (672 is the Entrez
# ID found for BRCA1 in 4b).
kegg_id <- keggGet("hsa:672")
# The PATHWAY field of the first (only requested) record lists its pathways.
kegg_id[[1]][["PATHWAY"]]
##                         hsa01524                         hsa03440 
##       "Platinum drug resistance"       "Homologous recombination" 
##                         hsa03460                         hsa04120 
##         "Fanconi anemia pathway" "Ubiquitin mediated proteolysis" 
##                         hsa04151                         hsa05206 
##     "PI3K-Akt signaling pathway"            "MicroRNAs in cancer" 
##                         hsa05224 
##                  "Breast cancer"
#4d) Other genes involved in the same pathways
# The earlier attempt, mget("hsa:672", org.Hs.egSYMBOL, ifnotfound = NA),
# returned NA for that key, and a symbol lookup for a single gene cannot list
# the other members of a pathway anyway. Instead, fetch every pathway record
# that BRCA1 belongs to and read the genes listed in it. The result is kept in
# pathway_genes so that kegg_id (the BRCA1 record from 4c) is not overwritten.
brca1_pathways <- kegg_id[[1]]$PATHWAY
pathway_genes <- lapply(names(brca1_pathways), function(pathway_id) {
  gene_field <- keggGet(pathway_id)[[1]]$GENE
  if (is.null(gene_field)) {
    return(character(0))
  }
  # NOTE(review): assumes keggGet() returns GENE as a character vector that
  # alternates gene ID, description -- confirm against a live record.
  entrez_ids <- gene_field[c(TRUE, FALSE)]
  # Translate the bare Entrez IDs to gene symbols; unknown IDs become NA.
  symbols <- unlist(mget(entrez_ids, org.Hs.egSYMBOL, ifnotfound = NA))
  # Drop unmapped IDs and BRCA1 itself, leaving only the "other" genes.
  sort(setdiff(symbols[!is.na(symbols)], "BRCA1"))
})
# Label each gene list with the pathway's descriptive name.
names(pathway_genes) <- brca1_pathways
pathway_genes

#5) Phenotype breacher graphing

library("viridis")
# Load the phenotype table for the breacher HM450 samples.
hm450 <- read.csv("~/Grad School/STAT 646/Phenotype_Breacher_HM450.csv")
# Sample names to exclude from the plots below.
omit1 <- c("879", "4667", "5505", "7725")
omit2 <- c(
  "879", "4667", "5505", "7725", "3F05_1_2", "3F05_10_2",
  "Leuk_ZC", "Leuk_FH", "3D03_10_2"
)
# Use %in% (set membership) rather than ==. With ==, the omit vector is
# recycled element-by-element along Sample_Name, so a row is only flagged when
# its name happens to line up with the recycled position.
o1 <- which(hm450$Sample_Name %in% omit1)
o2 <- which(hm450$Sample_Name %in% omit2)
# Filter with a logical mask instead of negative indices: if nothing matched,
# hm450[-integer(0), ] would return zero rows instead of the whole table.
hm <- hm450[!(hm450$Sample_Name %in% c(omit1, omit2)), ]

#5a) Bar plot
# Keep only the samples whose name ends in "_1".
pre <- hm[grepl("_1$", hm$Sample_Name), ]
# Career-breach categories in increasing order; values outside this set
# become NA in the ordered factor.
breach_levels <- c("0", "1-9", "10-39", "40-99", "100-199", "200-399", "400+")
pren <- factor(pre$X..career..breaches, levels = breach_levels, ordered = TRUE)
# Number of samples in each category, drawn as one bar per category.
counts <- table(pren)
barplot(
  counts,
  main = "# breaches bar plot",
  xlab = "categories",
  ylab = "# breaches"
)

#5b) Pie Chart
# Split the samples at the boundary between the "10-39" and "40-99" categories:
# TRUE means the category is below "40-99" (fewer than 40 career breaches).
under_40 <- pren < "40-99"
counts2 <- table(ordered(under_40, levels = c("TRUE", "FALSE")))
# Share of each group as a percentage, rounded to three decimals.
pct <- round(counts2 * 100 / sum(counts2), 3)
perc <- paste0(pct, "%")
# One palette shared by the slices and the legend so the colours always agree.
pie_cols <- viridis(length(pct), begin = .4, end = .8)
pie(counts2, perc, col = pie_cols, main = "Pie Chart")
# Legend follows the slice order (TRUE first). The labels state the real
# cutoff: the original "<39"/">39" left a count of exactly 39 unlabelled even
# though it falls in the "10-39" category.
legend(1, 1, c("<40", ">=40"), fill = pie_cols)

#5c) Two variable bar plot
# Cross-tabulate TBI history (rows) against breach category (columns).
tbi <- pre$hx.of.TBI
counts3 <- table(tbi, pren)
# The number of colours reuses length(pct) from the pie-chart step.
n_cols <- length(pct)
barplot(
  counts3,
  col = viridis(n_cols, begin = .6, end = .9),
  xlab = "# of breachers"
)
# The legend requests the palette in the opposite direction (begin = .9,
# end = .6), so "TBI=yes" takes the colour at .9 and "TBI=no" the one at .6.
legend(
  6, 12,
  legend = c("TBI=yes", "TBI=no"),
  fill = viridis(n_cols, begin = .9, end = .6)
)

#6) The central dogma of biology says that genetic information starts with DNA which undergoes transcription to produce RNA, and then the RNA undergoes translation to produce proteins. This flow of information can only go in one direction, DNA->RNA->Protein.

A chromosome is a structure made of protein and DNA, with the DNA organized into genes; it resides in the nucleus of a cell. DNA is a double-helix molecule built from four nucleotides that contains all the genetic information for an organism. A gene is a specific segment of a DNA strand, and it is what gets transcribed to form RNA.