블로그 내부검색
[TIP] biomaRt
Labels:
Informatics
Email ThisBlogThis!Share to XShare to Facebook
http://biomart2010.wikispaces.com/
Installing biomaRt
-
source( "http://www.bioconductor.org/biocLite.R") biocLite(’biomaRt’)
Check Available BioMarts
-
library(biomaRt) listMarts()
Select the Ensembl BioMart
-
ensembl = useMart("ensembl") datasets = listDatasets(ensembl) ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl") attributes=listAttributes(ensembl) filters=listFilters(ensembl)
Task 1a
-
affyids = c("211550_at","202431_s_at","206044_s_at") annotation = getBM(c("affy_hg_u133_plus_2","ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position","band","strand"), filters="affy_hg_u133_plus_2", values=affyids,mart = ensembl) print(annotation)
Task 1aa
library(biomaRt)
mart = useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
refseq = c("NM_006945", "NM_152486", "NM_198317", "XM_001125684" )
#getBM(filters=c( "refseq_mrna", "refseq_mrna_predicted"), attributes="external_gene_id", values=refseq, mart=mart)
getBM(filters="refseq_mrna", attributes=c("refseq_mrna", "hgnc_symbol"), values=refseq, mart=mart)
listAttributes( mart)[grep( "refseq", listAttributes( mart)[,1]), ]
# coding
NM = getBM(filters="refseq_mrna", attributes=c("refseq_mrna", "hgnc_symbol"), values=refseq, mart=mart)
# coding - predict
XM = getBM(filters="refseq_mrna_predicted", attributes=c("refseq_mrna_predicted", "hgnc_symbol"), values=refseq, mart=mart )
# non-coding
NR = getBM(filters="refseq_ncrna" , attributes=c("refseq_ncrna", "hgnc_symbol"), values=refseq, mart=mart )
# non-coding-predicted
XR = getBM(filters="refseq_ncrna_predicted", attributes=c("refseq_ncrna_predicted", "hgnc_symbol"), values=refseq, mart=mart )
colnames( NM) = c("refseq", "hgnc")
colnames( XM) = c("refseq", "hgnc")
colnames( NR) = c("refseq", "hgnc")
colnames( XR) = c("refseq", "hgnc")
refseq2hgnc = rbind( NM, XM, NR, XR )
Task 1b
-
illuminaIDs = c("ILMN_1728071","ILMN_1662668") goAnnot = getBM(c("illumina_humanwg_6_v2", "go_biological_process_id","go_biological_process_linkage_type"), filters="illumina_humanwg_6_v2", values=illuminaIDs, mart = ensembl) print(goAnnot[1:5,])
Task 2
-
diab=getBM(c("ensembl_gene_id","hgnc_symbol"),filters=c("mim_morbid_accession","go"), values=list(c("125853","222100"),"GO:0003700"),mart=ensembl) print(diab)
Task3
-
miRNA = getBM(c("mirbase_id","ensembl_gene_id","start_position","chromosome_name"), filters=c("chromosome_name","with_mirbase"), values=list(13,TRUE), mart=ensembl) miRNA[1:5,]
Task4
-
filterOptions("snptype_filters",ensembl) entrez = getBM("entrezgene",filters=c("chromosome_name","snptype_filters"), values=list(22,"NON_SYNONYMOUS_CODING"),mart=ensembl) entrez[1:5,]
Task5
-
seq = getSequence(id="CDH1", type="hgnc_symbol",seqType="gene_exon", mart = ensembl) seq[1,]
Task6
-
promoter=getSequence(id=c("APC","CUL1"), type="hgnc_symbol",seqType="coding_gene_flank",upstream =2000, mart=ensembl)
Task7
-
human=useMart("ensembl", dataset="hsapiens_gene_ensembl") chicken=useMart("ensembl", dataset="ggallus_gene_ensembl") mapping = getLDS(attributes=c("affy_hg_u95av2","hgnc_symbol"), filters="affy_hg_u95av2", values=c("1888_s_at","1434_at"),mart=human,attributesL="affy_chicken", martL=chicken)
Accessing Archived Ensembl versions
-
listMarts(host="may2009.archive.ensembl.org/biomart/martservice/") ensembl54=useMart("ENSEMBL_MART_ENSEMBL", host="may2009.archive.ensembl.org/biomart/martservice/") listDatasets(ensembl54) ensembl54=useMart("ENSEMBL_MART_ENSEMBL", host="may2009.archive.ensembl.org/biomart/martservice/", dataset='hsapiens_gene_ensembl')
Task8
-
snp=useMart("snp", dataset="hsapiens_snp") out=getBM(attributes=c("refsnp_id","allele","chrom_start"), filters=c("chr_name","chrom_start","chrom_end"), values=list(8,148350,158612), mart=snp) out[1:5,]
Task9
-
hapmap = useMart("HapMap_rel27", dataset="hm27_variation_yri") yri = getBM(c("chrom","start","alleles","ref_allele","ref_allele_freq","other_allele_freq"),filters=c("chrom","coding_nonsynon"),values=list("chr19",TRUE),mart=hapmap) head(yri)
Task10
-
cosmic=useMart("CosmicMart",dataset="COSMIC47") mut = getBM(c("sample_name","gene_name","aa_mut_syntax","mut_type_cds","zygosity"),filters="sample_name",values=c("MCF7","BT474"),mart=cosmic) head(mut)
Task11
-
reactome = useMart("REACTOME", dataset="pathway") ids = getBM(c("referencedatabase_uniprot","_displayname"),filters=c("_displayname","species_selection"),value=list(c("DNA Repair","Signaling by WNT","Muscle contraction"),"Homo sapiens"), mart=reactome)
Task12
-
gramene = useMart("ENSEMBL_MART_ENSEMBL", dataset="athaliana_eg_gene") getBM(c("ensembl_gene_id","external_gene_id","description","start_position","end_position"), filters=c("chromosome_name","start","end"), values=list("1", "30000","41000"), mart=gramene)
Task13
-
worm = useMart("wormbase195", dataset="wormbase_rnai") pheno = getBM(c("rnai","phenotype_primary_name"), filters="gene", values="WBGene00006763", mart=worm) head(pheno)
Task14 : get gene infos
- gene.list = c( "MBNL3", "WNT5A", "NFIA" )
- gs = getBM(attributes=c("hgnc_symbol", "ensembl_gene_id", "chromosome_name", 'strand', "start_position", "end_position", "band"), filter=c("hgnc_symbol"), values=gene.list, mart=ensmart)
archive:
http://useast.ensembl.org/info/website/archives/index.html
## biomart = "ENSEMBL_MART_ENSEMBL" # use archive to get ensembl 65 ## biomaRt_dataset = "mmusculus_gene_ensembl" # mouse dataset id name ## host = "dec2011.archive.ensembl.org" # use ensembl 65 for annotation for mm9 ## goAnno = "org.Mm.eg.db" # GO annotation database ## mm10 mart = useMart('ENSEMBL_MART_ENSEMBL',dataset='mmusculus_gene_ensembl',host="may2012.archive.ensembl.org")
to get customized seqs
library(biomaRt) ensmart=useMart("ensembl",dataset="hsapiens_gene_ensembl" ) #, mysql=TRUE) gene = getGene(id='PITX2', type='hgnc_symbol', mart=ensmart) seq = getSequence(id="BRCA1", type="hgnc_symbol", seqType="peptide", mart = mart) show(seq) seq = getSequence(id="1939_at", type="affy_hg_u95av2", seqType="gene_flank",upstream = 20, mart = mart) show(seq) ## ## entrez ## entrez=c("673","7157","837") getSequence(id = entrez, type="entrezgene",seqType="coding_gene_flank",upstream=10, mart=ensmart) OR library(BSgenome.Hsapiens.UCSC.hg19) toString(subseq(Hsapiens$chr17, 1000000,1000005)) OR http://useast.ensembl.org/das/Homo_sapiens.GRCh37.reference/sequence?segment=1:100000,110000
(my gene_list is in gene symbol) >my_mart = useMart("ensembl",dataset="hsapiens_gene_ensembl") >seq_3utr = getSequence(id = unique(gene.symbol), type="hgnc_symbol",seqType="3utr",mart = my_mart) >seq_3utr = seq_3utr[seq_3utr[,"3utr"] != "Sequence unavailable",] >here: extract longest 3'UTR for each unique gene symbol >exportFASTA(seq_3utr, file=paste("s3utr.fa",sep="")) As my project goes, I now need 3'UTR genomic coordinates to get phastcons conservation for some regions in 3'UTR. To do that, I first convert hgnc_symbol back to ensembl_gene_id, then get 3'UTR coordinates using getBM like this: >s3utr = read.DNAStringSet(paste("s3utr.fa",sep=""),format="fasta") >gene_names = names(s3utr) >hgnc2ensembl = getBM(attributes=c("hgnc_symbol","ensembl_gene_id"), filters="hgnc_symbol", values=gene_names, mart=my_mart) >s3utr_pos = getBM(attributes=c("ensembl_gene_id", "chromosome_name","strand","3_utr_start", "3_utr_end"), filters="ensembl_gene_id", values=as.character(hgnc2ensembl $ensembl_gene_id), mart=my_mart) >s3utr_pos = s3utr_pos[complete.cases(s3utr_pos),] By doing that, now I can only get about 5000 gene symbols with 3'UTR coordinates (converting from hgnc_symbol back to ensembl_gene_id looses about 250 genes). I was thinking it might be version difference? So I tried to use ensembl archive but it gives me error as below: > my_mart = useMart("ensembl_mart_50",dataset="hsapiens_gene_ensembl",archive=T) Error in value[[3L]](cond) : Request to BioMart web service failed. Verify if you are still connected to the internet. Alternatively the BioMart web service is temporarily down. In addition: Warning message: In file(file, "r") : cannot open: HTTP status was '404 Not Found'
Retrieve genomic positions
#R --slave \< ../get-genomic-coordinate-using-biomaRt.R genelist output args = commandArgs() gene.list.file = args[3] output = args[4] #gene.list.file = "a" gene.list = read.table( file=gene.list.file, header=F, sep="\t" ) gene.list = as.vector( gene.list$V1 ) # V1 == hgnc_symbol #library(GenomeGraphs) library(biomaRt) #mart = useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl") #---------------- # match to hg19 / NCBI build 37 #---------------- #mart = useMart( biomart="ensembl", dataset = "hsapiens_gene_ensembl") ###---------------- ### match to hg18 / NCBI build 36 ###---------------- mart = useMart( biomart="ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "may2009.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) # to use the hg18, NCBI build 36 gs = getBM(attributes=c("hgnc_symbol", "ensembl_gene_id", "chromosome_name", 'strand', "start_position", "end_position", "band"), filter=c("hgnc_symbol"), values=gene.list, mart =mart) #gs = getBM(attributes=c("hgnc_symbol", "ensembl_gene_id", "chromosome_name", "start_position", "end_position", "band"), filter=c("hgnc_symbol"), values=gene_symbol, mart=mart) #att1 = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "strand", "start_position", "end_position", "transcript_start", "transcript_end", "band") #att2 = c("ensembl_gene_id", "5_utr_start", "5_utr_end","3_utr_start", "3_utr_end", "exon_chrom_start", "exon_chrom_end") #gs1 = getBM(mart=mart, attributes=att1, value=c("hgnc_symbol") ) #, values=gene_symbol) #gs2 = getBM(mart=mart, attributes=att2 ) #, values=gene_symbol) #gs= merge(gs1,gs2) #gs = unique( gs ) #gene.list = c( "EIF4ENIF1" ) out = NULL notMatch = NULL for( symbol in gene.list ) { ## this can be multi-lines info = gs[gs$hgnc_symbol == symbol,][,c(1,3:6)] if( length(info) == 0 ) { notMatch = c( notMatch, symbol ) } else { chr = as.vector( info$chromosome_name ) out = rbind( out, info[grep( "^[0-9XY]", chr ),] ) } } print( out ) #out[ out$chromosome_name == "X",]$chromosome_name = "23" #out[ out$chromosome_name == "Y",]$chromosome_name = "24" write.table( out, file=output, row.names=F, col.names=F, sep="\t", quot=F )
to get the attributes from multiple pages of the same mart.
library("biomaRt") mart = useMart("ensembl") ensembl = useDataset("hsapiens_gene_ensembl", mart = mart) p = c("200641_s_at" , "229411_at" , "223376_s_at") Q1 = getBM(attributes = c("affy_hg_u133_plus_2","entrezgene","ensembl_gene_id"), filters = "affy_hg_u133_plus_2", mart=ensembl, values=p) Q2 = getBM(attributes=c("validated","ensembl_gene_id"), filters="affy_hg_u133_plus_2", mart=ensembl, values=p) ## clean up: ## Q1 = subset(Q1, !is.na(entrezgene)) Q = merge(Q1,Q2)SNPs
library("biomaRt") snp = useMart("snp", dataset="hsapiens_snp") ensemblids = c("ENSG00000204296") out = getBM(attributes=c("refsnp_id","chr_name","chrom_start", "ensembl_gene_stable_id","validated", "consequence_type_tv"), filters=c("ensembl_gene"), values=c(ensemblids), mart=snp) head(out) refsnp_id chr_name chrom_start ensembl_gene_stable_id validated 1 rs517922 6 32258836 ENSG00000204296 hapmap 2 rs3117133 6 32313653 ENSG00000204296 cluster,freq,1000Genome 3 rs6621681 6 32292217 ENSG00000204296 4 rs6621681 6 32292217 ENSG00000204296 5 rs6621682 6 32292221 ENSG00000204296 6 rs6621682 6 32292221 ENSG00000204296 consequence_type_tv 1 DOWNSTREAM 2 INTRONIC 3 INTRONIC 4 NON_SYNONYMOUS_CODING 5 INTRONIC 6 SYNONYMOUS_CODING
Search TFBS agains promoter regions (~up/dn2K from TSS)
IUPAC Code Meaning Complement A A T C C G G G C T/U T A M A or C K R A or G Y W A or T W S C or G S Y C or T R K G or T M V A or C or G B H A or C or T D D A or G or T H B C or G or T V N G or A or T or C N [kwangmin@10 MSigDB-v3.0]$ grep PITX2 c3.tft.v3.0.symbols.gmt.morifSeq.txt V$PITX2_Q2 CTCNANGTGNY GGATTA_V$PITX2_Q2 YATGNWAAT [kwangmin@10 MSigDB-v3.0]$ grep SOX9 c3.tft.v3.0.symbols.gmt.morifSeq.txt V$SOX9_B1 SKYTAAAAATAACYCH CATTGTYY_V$SOX9_B1 YATGNWAAT [kwangmin@10 MSigDB-v3.0]$ grep PITX2_Q2 c3.tft.v3.0.symbols.gmt library( biomaRt ) library(BSgenome.Hsapiens.UCSC.hg19) #ensmart = useMart( biomart="E", dataset = "hsapiens_gene_ensembl" ) ensmart = useMart( biomart="ensembl", dataset="hsapiens_gene_ensembl") system( "awk -F\"\t\" '{ if($4 > 2.5 && $5 > 2.5) print $2}' ~/Desktop/shSOX9_shPITX2-vs-shControl-BH5pct-FC1.csv | sort | uniq > up" ) system( "awk -F\"\t\" '{ if($4 < 0.5 && $5 < 0.5) print $2}' ~/Desktop/shSOX9_shPITX2-vs-shControl-BH5pct-FC1.csv | sort | uniq > dn" ) #gene.list = c( "SOX9", "PITX2", "RORA" ) #gene.list = as.vector( read.table( file="up")$V1 ) gene.list = as.vector( read.table( file="dn")$V1 ) #gs = getBM(attributes=c("hgnc_symbol", "ensembl_gene_id", "chromosome_name", 'strand', "start_position", "end_position", "band"), filter=c("hgnc_symbol"), values=gene.list, mart=ensmart)
## ---> not gene start/end, but transcript_start/end for TSS.
## use only complete chromosome ## gs = gs[ grep( "^[0-9|X|Y]", gs$chromosome_name, perl=T), ] g.sym = gs$hgnc_symbol g.str = gs$strand g.start = gs$start_position g.end = gs$end_position g.chr = paste( "chr", gs$chromosome, sep="") ## ## getSequence is cool for get up2k, but not dn2k. ## ## getSequence(id=gene.list, type="hgnc_symbol", seqType="coding_gene_flank", upstream = 20, mart = ensmart) ## ## up-stream from TSS ## promoter = 2000 g.seq.up = NULL g.seqs.up = NULL for( i in 1:length(g.sym) ) { if( g.str[i] == 1 ) { g.seq.up = toString(subseq(Hsapiens[[g.chr[i]]], g.start[i]-promoter, g.start[i] )) g.seqs.up = c( g.seqs.up, g.seq.up ) } else if( g.str[i] == -1 ) { g.seq.up = toString( reverseComplement( subseq(Hsapiens[[g.chr[i]]], g.end[i], g.end[i]+promoter)) ) g.seqs.up = c( g.seqs.up, g.seq.up ) } } ## ## down-stream from TSS ## g.seq.dn = NULL g.seqs.dn = NULL for( i in 1:length(g.sym) ) { if( g.str[i] == 1 ) { g.seq.dn = toString(subseq(Hsapiens[[g.chr[i]]], g.start[i], g.start[i]+promoter )) g.seqs.dn = c( g.seqs.dn, g.seq.dn ) } else if( g.str[i] == -1 ) { g.seq.dn = toString( reverseComplement( subseq(Hsapiens[[g.chr[i]]], g.end[i]-promoter, g.end[i])) ) g.seqs.dn = c( g.seqs.dn, g.seq.dn ) } } search.table = as.data.frame( cbind( g.sym, g.chr, g.str, g.seqs.up, g.seqs.dn ) ) #patterns = c( 'AAAAAA' ) #matchPattern( DNAString("G[:upper:]G"), DNAString( search.table$g.seqs.up[1]) , fixed=FALSE) # 3 hits on strand + #[kwangmin@10 MSigDB-v3.0]$ grep PITX2 c3.tft.v3.0.symbols.gmt.morifSeq.txt #V$PITX2_Q2 CTCNANGTGNY #GGATTA_V$PITX2_Q2 YATGNWAAT #[kwangmin@10 MSigDB-v3.0]$ grep SOX9 c3.tft.v3.0.symbols.gmt.morifSeq.txt #V$SOX9_B1 SKYTAAAAATAACYCH #CATTGTYY_V$SOX9_B1 YATGNWAAT ## ## for detailed match position, use matchPattern() ## # matchPattern( DNAString("AAAMCT"), DNAString( search.table$g.seqs.up[1]) , fixed=F) # IUPAC 3 hits on strand ## ## IUPAC ambiguity codes ## pattern = c( "[A|T][A|T]CAA[A|T]G", "CTCANGTGNY", "YATGNWAAT", "SKYTAAAAATAACYCH" ) ## ## converted into regular expression for grep( perl=T) ## pattern = c( "[A|T][A|T]CAA[A|T]G", "CTCA[G|A|T|C]GTG[G|A|T|C][C|T]", "[C|T]ATG[G|A|T|C][A|T]AAT", "[C|G][G|T][C|T]TAAAAATAAC[C|T]C[A|C]" ) up = NULL dn = NULL upPromPatt = NULL dnPromPatt = NULL upProm = NULL dnProm = NULL for( i in 1:length(pattern)) { promoter.up = grep( pattern[i], search.table$g.seqs.up, perl=T) if( length(promoter.up) > 0) { for( j in promoter.up) { upPromPatt = c( upPromPatt, pattern[i]) upProm = c( upProm, as.vector(search.table$g.sym[j]) ) #print( upProm ) } up = rbind( up, cbind( upPromPatt, upProm ) ) upPromPatt = NULL upProm = NULL } promoter.dn = grep( pattern[i], search.table$g.seqs.dn, perl=T) if( length(promoter.dn) > 0 ) { for( k in promoter.dn) { dnPromPatt = c( dnPromPatt, pattern[i]) #print( dnPromPatt) dnProm = c( dnProm, as.vector(search.table$g.sym[k]) ) } dn = rbind( dn, cbind( dnPromPatt, dnProm ) ) dnPromPatt = NULL dnProm = NULL } } #write.table( up, file="shSOX9-and-shPITX2-both-2.5xUP-up2K-search.xls", sep="\t", row.names=F, quote=F) #write.table( dn, file="shSOX9-and-shPITX2-both-2.5xUP-dn2K-search.xls", sep="\t", row.names=F, quote=F) write.table( up, file="shSOX9-and-shPITX2-both-0.5xDN-up2K-search.xls", sep="\t", row.names=F, quote=F) write.table( dn, file="shSOX9-and-shPITX2-both-0.5xDN-dn2K-search.xls", sep="\t", row.names=F, quote=F) q()
bsgenome.Hsapiens.UCSC package
Examples http://svitsrv25.epfl.ch/R-doc/library/Biostrings/html/reverse.html
Get Exon seq from ID
t = read.csv( file=file, sep="\t", header=T) exon.id = as.character(t[,28] ) #exon.id = exon.id[1:4] #exon.id[5] = "jlkfljs" exon.id.all = unlist( strsplit( exon.id, "[|]", perl=T) ) # listDatabasets( e65 ) # listFilters( e65 ) # listAttributes( e65 ) myMart = getBM( attributes=c("ensembl_exon_id","gene_exon"), mart=e65, filter=c("ensembl_exon_id"), value=exon.id.all ) exon.seqs = NULL for( exon in exon.id ) { array = unlist( strsplit( exon, "[|]", perl=T) ) if( length(array) == 0 ) { exon.seqs = c( exon.seqs, " | -" ) } else { concat = NULL for( id in array ) { seq = myMart[ grep( paste("^",id,"$",sep=""), myMart$ensembl_exon_id), ]$gene_exon if( length(seq) == 0 ) { seq = "-" } concat = paste( concat, seq, sep=" | " ) } exon.seqs = c( exon.seqs, concat ) } } t$exon.seqs = exon.seqs
Email ThisBlogThis!Share to XShare to Facebook
Labels: Informatics
Scientist. Husband. Daddy. --- TOLLE. LEGE
외부자료의 인용에 있어 대한민국 저작권법(28조)과 U.S. Copyright Act (17 USC. §107)에 정의된 "저작권물의 공정한 이용원칙 | the U.S. fair use doctrine" 을 따릅니다. 저작권(© 최광민)이 명시된 모든 글과 번역문들에 대해 (1) 복제-배포, (2) 임의수정 및 자의적 본문 발췌, (3) 무단배포를 위한 화면캡처를 금하며, (4) 인용 시 URL 주소 만을 사용할 수 있습니다. [후원 | 운영] [대문으로] [방명록] [티스토리 (백업)] [신시내티]
