标签归档:R

Prepare a data frame for sample CNV data

If we want to cluster samples based on CNV data, a dataframe is needed. However, CNV segments in each sample are not the same. Maybe overlap or distinct. I think CNTools package migh solve this challenge. An example is shown as below. The result is a reduced segment data frame.

BiocManager::install("CNTools")
data("sampleData")
seg <- CNSeg(sampleData)
rdseg <- getRS(seg, by = "region", imput = FALSE, XY = FALSE, what = "mean") 
View(rdseg@rs)

Input dataframe has six columns (“ID”,”chrom”,”loc.start”,”loc.end”,”num.mark”,”seg.mean”) including 277 samples and 54825 segments.

The result can be got from rdseg@rs, like this

Cheers

Also, we can use CNRegions from iClusterPlus package.
CNregions(sampleData)

Ref: https://www.rdocumentation.org/packages/CNTools

https://rdrr.io/bioc/iClusterPlus/man/CNregions.html

#####################################################################
#版权所有 转载请告知 版权归作者所有 如有侵权 一经发现 必将追究其法律责
#Author: Jason
#####################################################################

FPKM转TPM

R code

fpkm2tpm = function(fpkm){
  exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
tpm = apply(expMatrix, 2, fpkm2tpm)

If the expression matrix has NA value

fpkm2tpm <- function(fpkm){
  tpm <- exp(log(fpkm) - log(sum(fpkm,na.rm=T)) + log(1e6))
  tpm[which(is.na(tpm))] <- 0
  return(tpm)
}

TPMi=( FPKMi / sum(FPKMj ) * 10^6

TCGABiolinks下载TCGA数据做生存分析

以前的工作是全基因组或全外分析,不涉及癌症和生存分析,但现在的工作主要围绕癌症方面,生存分析一定少不了。实验室小伙伴推荐用TCGAbiolinks下载TCGA的数据,于是研究了如何用TCGABiolinks下载TCGA的数据,以下载RNA的count数据为例,并做生存分析。

# 安装相关的包
if (!requireNamespace("BiocManager", quietly=TRUE))   
  install.packages("BiocManager")
BiocManager::install("TCGAbiolinks")
BiocManager::install("SummarizedExperiment")
install.packages('survival')
install.packages('survminer')
library(SummarizedExperiment)
library(TCGAbiolinks)
library(survival)
library(survminer)

# 项目的概括信息,project的名称可以从 https://portal.gdc.cancer.gov/ 上面选择对应的器官,进入之后左侧列表中就会显示。我们以下载TCGA-LIHC项目的数据为例。
TCGAbiolinks:::getProjectSummary("TCGA-LIHC") 
# 获取该项目样本的临床信息
clinical <- GDCquery_clinic(project = "TCGA-LIHC", type = "clinical")

# 下载TCGA-LIHC项目的rna-seq的counts数据,构建GDCquery,具体的数据类型的写法可以参考 https://portal.gdc.cancer.gov/repository 左侧列表
# 比如data type有Raw Simple Somatic Mutation, Annotated Somatic Mutation, Aligned Reads, Gene Expression Quantification, Slide Image这几种,不过不同的项目,有可能有不同的数据类型。
query <- GDCquery(project = "TCGA-LIHC", 
		    experimental.strategy = "RNA-Seq",
                    data.category = "Transcriptome Profiling", 
                    data.type = "Gene Expression Quantification",
		    workflow.type = "HTSeq - Counts")
# 下载数据
GDCdownload(query)
# 导入数据
LIHCRnaseq <- GDCprepare(query)
# 得到样本的基因counts矩阵,每行为一个基因,每列为一个样本
count_matrix <- assay(LIHCRnaseq)

# TCGA样本的编号以'-'分割,前三列是患者编号,第四列是类型,一般11表示正常样本,01表示肿瘤样本,见https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(count_matrix),typesample = c("NT"))
# selection of tumor samples "TP"
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(count_matrix),typesample = c("TP"))

# 根据ensembl id选择特定基因在癌症样本中的表达
gexp <- count_matrix[c("ENSG00000198431"),samplesTP]

# 这样选出的样本都是肿瘤样本,但样本名称是TCGA-CC-A7IL-01A-11R-A33R-07,与clinical信息不对应,需要将名字简化成TCGA-CC-A7IL
names(gexp) <-  sapply(strsplit(names(gexp),'-'),function(x) paste0(x[1:3],collapse="-"))

# 将表达数据和clinical数据合并,并且挑出来用于做生存分析的数据
clinical$"ENSG00000198431" <- gexp[clinical$submitter_id]
df<-subset(clinical,select =c(submitter_id,vital_status,days_to_death,ENSG00000198431))

#去掉NA
df <- df[!is.na(df$ENSG00000198431),]

#取基因的平均值,大于平均值的为H,小于为L
df$exp <- ''
df[df$ENSG00000198431 >= mean(df$ENSG00000198431),]$exp <- "H"
df[df$ENSG00000198431 <  mean(df$ENSG00000198431),]$exp <- "L"

# 将status表示患者结局,1表示删失,2表示死亡
df[df$vital_status=='Dead',]$vital_status <- 2
df[df$vital_status=='Alive',]$vital_status <- 1
df$vital_status <- as.numeric(df$vital_status)

# 建模
fit <- survfit(Surv(days_to_death, vital_status)~exp, data=df) # 根据表达建模
# 显示P value
surv_pvalue(fit)$pval.txt 
# 画图
ggsurvplot(fit,pval=TRUE)

继续阅读

topGO包进行-GO富集分析-做有向无环图

GOdata = new(“topGOdata”, ontology = “MF”, allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2GO)

利用topGO进行分析,最重要的是构建topGO对象,构建topGO需要两个参数:

1,topGO需要基因和GO号的对应关系

2,基因列表,用来标记背景基因(所有基因)及差异基因

一,获取ensembl和GO号的对应关系: geneID2GO

如果你有现成的gene id和go id的对应关系,文件格式为

gene_ID制表符GO_ID1, GO_ID2, GO_ID3, ...

每行,则可以利用readMappings读取该文件(topGO包里面的函数)。

而我有一个cuffdiff的结果文件gene_exp.diff,用的是ensembl的注释GFF文件。首先需要获得ensembl和GO的对应关系,这里利用biomaRt包(前面的文章中有讲,这里不详细介绍)。

library(biomaRt)
genes = useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl")
# 得到go信息和gene
gene2goInfo <- getBM(attributes=c('ensembl_gene_id','go_id','entrezgene','name_1006','go_linkage_type','namespace_1003'), mart = genes)
# 过滤
gene2goInfo=gene2goInfo[gene2goInfo$go_id != "", ]
# 上述是一个gene对应一个go id,需要合并为一个gene对应多个go id,利用by函数(神器)
geneID2GO = by(gene2goInfo$go_id, gene2goInfo$ensembl_gene_id, function(x) as.character(x))

geneID2GO是符合topGO要求的,gene2goInfo和geneID2GO的格式如下

> head(gene2goInfo)
  ensembl_gene_id      go_id entrezgene                         name_1006
3 ENSG00000281614 GO:0005886       3635                   plasma membrane
4 ENSG00000281614 GO:0005829       3635                           cytosol
5 ENSG00000281614 GO:0005515       3635                   protein binding
6 ENSG00000281614 GO:0007165       3635               signal transduction
7 ENSG00000281614 GO:0005856       3635                      cytoskeleton
8 ENSG00000281614 GO:0050852       3635 T cell receptor signaling pathway
  go_linkage_type     namespace_1003
3             IEA cellular_component
4             TAS cellular_component
5             IPI molecular_function
6             TAS biological_process
7             IEA cellular_component
8             TAS biological_process

> head(geneID2GO)
$ENSG00000000003
[1] "GO:0004871" "GO:0005515" "GO:0005887" "GO:0070062" "GO:0007166"
[6] "GO:0043123" "GO:0039532" "GO:1901223"

$ENSG00000000005
[1] "GO:0016021" "GO:0005737" "GO:0005515" "GO:0005635" "GO:0016525"
[6] "GO:0001886" "GO:0001937" "GO:0071773" "GO:0035990"

二,构建基因列表:geneList

diff=read.table("./gene_exp.diff",sep="t",header=TRUE)
# 差异表达基因
interesting_genes=factor(diff[diff$significant=="yes",]$gene_id)
# 所有基因
all_genes <- diff$gene_id
# 构建基因列表
geneList <- factor(as.integer (all_genes %in% interesting_genes))
names(geneList)=all_genes

三,构建topGO对象

source("https://bioconductor.org/biocLite.R")
chooseBioCmirror()
biocLite("topGO")
biocLite("org.Hs.eg.db")
biocLite("GO.db")
biocLite("Rgraphviz")
library(topGO)

GOdata <- new("topGOdata", ontology = "MF", allGenes = geneList,annot = annFUN.gene2GO, gene2GO = geneID2GO)

Building most specific GOs …..
( 3893 GO terms found. )

Build GO DAG topology ……….
( 4356 GO terms and 5490 relations. )

Annotating nodes ……………
( 16722 genes annotated to the GO terms. )

四,基因富集分析

Fisher检验,通过2*2的列联表的进行计算。当然还有其他检验,比如KS,KS.elim。

—————-非差异表达基因—差异表达基因
注释到A通路——– 20 ———– 50
没有注释到A通路—- 1870 ——— 80

resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
allRes <- GenTable(GOdata,classicFisher = resultFisher, orderBy = "classicFisher", topNodes = 10)

根据orderBy的参数对象进行排序,得到结果如下

GO.ID Term Annotated Significant
1 GO:0032794 GTPase activating protein binding 14 2
2 GO:0015278 calcium-release channel activity 16 2
3 GO:0099604 ligand-gated calcium channel activity 16 2
4 GO:0005527 macrolide binding 19 2
5 GO:0005528 FK506 binding 19 2
6 GO:0001225 RNA polymerase II transcription coactiva… 1 1

Expected classicFisher
1 0.05 0.0012
2 0.06 0.0016
3 0.06 0.0016
4 0.07 0.0023
5 0.07 0.0023
6 0.00 0.0038

五,生成有向无环图(directed acycline praph,DAG)

svg("go.dag.svg")
showSigOfNodes(GOdata, score(resultFisher), firstSigNodes = 5, useInfo = 'all')
dev.off()

继续阅读

更新Bioconductor包–update a Bioconductor package

A package belong to Bioconductor was updated (major revision) and released. I want to use the up to date package while in analysing. I am willing to use the new feature, so I need to update this package. I had tried many ways and many times, and finally found a possible way.

更新Bioconductor中的特定包到最新版本。需要首先更新R,其次更新Bioconductor,最终更新包。

1, First your should update your R version.

sudo apt-get update
sudo apt-get install r-base

This will allow the latest Bioconductor to work compitablly.

2, Second update Bioconductor

remove.packages("BiocInstaller")  
source("http://bioconductor.org/biocLite.R") 
biocLite()  

# this will fix an error: Error: Bioconductor version *** cannot be upgraded with R version ***
# install latest BiocInstaller
# update Bioconductor

3,Third update your target package

remove.packages("package-name")   
biocLite("package-name")  

# remove old version
# install the latest version

#####################################################################
#版权所有 转载请告知 版权归作者所有 如有侵权 一经发现 必将追究其法律责任
#Author: Jason
####################################################################