Package URL: https://github.com/jokergoo/EnrichedHeatmap
Package Doc: http://jokergoo.github.io/EnrichedHeatmap/articles/EnrichedHeatmap.html
Demo Dataset: GSE80779 and GSE80774 from ENL links histone acetylation to oncogenic gene expression in AML. Nature. 2017;543(7644):265-269
License: GPL-3.0
In this tutorial, we will use some of the processed data from GSE80779 ChIP-seq and GSE80774 RNA-seq to demonstrate how to use EnrichedHeatmap to show enrichment profiles. In order to build the vignette fast, the data only includes chromosome 22 of human genome (hg19).
Data | Type | Description |
---|---|---|
GSM2136938_MOLM13.H3K27ac | ChIP-seq | Enriched at active enhancers |
GSM2136939_MOLM13.H3K4me1 | ChIP-seq | Enriched at enhancers (both active and poised) |
GSM2136940_MOLM13.H3K4me3 | ChIP-seq | Enriched at actively transcribed promoters |
GSM2136941_MOLM13.H3K9ac | ChIP-seq | Enriched at active enhancers |
GSM2136954_MOLM13.POL2_sgGFP | ChIP-seq | Pol II, Control |
GSM2136952_MOLM13.POL2-S2P_sgGFP | ChIP-seq | Pol II Ser2P, Control |
GSM2136953_MOLM13.POL2_sgENL5 | ChIP-seq | Pol II - ENL KD |
GSM2136933_MOLM13.CDK9_Control | ChIP-seq | CDK9, Control (super elongation complex (SEC) component) |
GSM2136934_MOLM13.CDK9_ENLKO | ChIP-seq | CDK9 - ENL KD |
GSE80774_MOLM13-sgGFP-DMSO | RNA-seq | Control |
Data | Description |
---|---|
Gene annotation | From GENCODE v34lift37 |
git clone https://github.com/ycl6/EnrichedHeatmap-Demo.git
suppressPackageStartupMessages({
library(data.table) # Also requires R.utils to read gz and bz2 files
library(GenomicRanges)
library(EnrichedHeatmap)
library(circlize)
})
Change the path to correspond to the location of the data files.
## [1] "gencode.chr22.txt.gz"
## [2] "GSE80774_MOLM13-sgGFP-DMSO.RNAseq.counttab.txt.gz"
## [3] "GSM2136933_MOLM13.CDK9_Control.chr22.bdg.gz"
## [4] "GSM2136934_MOLM13.CDK9_ENLKO.chr22.bdg.gz"
## [5] "GSM2136938_MOLM13.H3K27ac.chr22.bdg.gz"
## [6] "GSM2136939_MOLM13.H3K4me1.chr22.bdg.gz"
## [7] "GSM2136940_MOLM13.H3K4me3.chr22.bdg.gz"
## [8] "GSM2136941_MOLM13.H3K9ac.chr22.bdg.gz"
## [9] "GSM2136952_MOLM13.POL2-S2P_sgGFP.chr22.bdg.gz"
## [10] "GSM2136953_MOLM13.POL2_sgENL5.chr22.bdg.gz"
## [11] "GSM2136954_MOLM13.POL2_sgGFP.chr22.bdg.gz"
DT_H3K27ac <- fread(paste0(data, "GSM2136938_MOLM13.H3K27ac.chr22.bdg.gz"))
DT_H3K4me1 <- fread(paste0(data, "GSM2136939_MOLM13.H3K4me1.chr22.bdg.gz"))
DT_H3K4me3 <- fread(paste0(data, "GSM2136940_MOLM13.H3K4me3.chr22.bdg.gz"))
DT_H3K9ac <- fread(paste0(data, "GSM2136941_MOLM13.H3K9ac.chr22.bdg.gz"))
DT_POL2 <- fread(paste0(data, "GSM2136954_MOLM13.POL2_sgGFP.chr22.bdg.gz"))
DT_POL2ENLKD <- fread(paste0(data, "GSM2136953_MOLM13.POL2_sgENL5.chr22.bdg.gz"))
DT_POL2S2P <- fread(paste0(data, "GSM2136952_MOLM13.POL2-S2P_sgGFP.chr22.bdg.gz"))
DT_CDK9 <- fread(paste0(data, "GSM2136933_MOLM13.CDK9_Control.chr22.bdg.gz"))
DT_CDK9ENLKD <- fread(paste0(data, "GSM2136934_MOLM13.CDK9_ENLKO.chr22.bdg.gz"))
DT_genes <- fread(paste0(data, "gencode.chr22.txt.gz"))
rnaseq <- data.frame(fread(paste0(data, "GSE80774_MOLM13-sgGFP-DMSO.RNAseq.counttab.txt.gz")))
rownames(rnaseq) <- rnaseq$gene_id
colnames(rnaseq)[2:3] <- c("rep1","rep2")
rnaseq$mean <- apply(rnaseq[,2:3], 1, mean)
# Calculate CPM using edgeR
rnaseq$cpm <- edgeR::cpm(rnaseq$mean)
# Subset rnaseq and genes to include genes present in both Data Frames
rnaseq <- rnaseq[rnaseq$gene_id %in% DT_genes$V4,]
DT_genes <- DT_genes[DT_genes$V4 %in% rnaseq$gene_id,]
# Reorder rows according to DT_genes
rnaseq <- rnaseq[DT_genes$V4,]
## [1] 1215
## [1] 1215
Note: You can also import annotation from GFF or GTF using the
import.gff3
orimport
from thertracklayer
package.
gff_file <- "gencode.v34lift37.annotation.gff3.gz"
gff_url <- "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/GRCh37_mapping/gencode.v34lift37.annotation.gff3.gz"
# Download GFF from GENCODE
download.file(gff_url, paste0(data, "/", gff_file))
# Create GRanges object from GFF
ggff <- rtracklayer::import.gff3("data/gencode.v34lift37.annotation.gff3.gz")
# To subset gene entries from chr22 from ggff
genes <- ggff[seqnames(ggff) == "chr22" & ggff$type == "gene"]
names(genes) <- substr(genes$gene_id, 1, 15)
genes
## GRanges object with 1391 ranges and 31 metadata columns:
## seqnames ranges strand | source type score
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
## ENSG00000233866 chr22 16062157-16063236 + | HAVANA gene NA
## ENSG00000229286 chr22 16076052-16076172 - | HAVANA gene NA
## ENSG00000235265 chr22 16084249-16084826 - | HAVANA gene NA
## ENSG00000223875 chr22 16100517-16124973 - | HAVANA gene NA
## ENSG00000215270 chr22 16122720-16123768 + | HAVANA gene NA
## ... ... ... ... . ... ... ...
## ENSG00000100312 chr22 51176624-51183767 + | HAVANA gene NA
## ENSG00000254499 chr22 51179021-51181948 - | HAVANA gene NA
## ENSG00000213683 chr22 51193103-51193862 - | HAVANA gene NA
## ENSG00000184319 chr22 51195376-51239737 + | HAVANA gene NA
## ENSG00000079974 chr22 51205929-51222091 - | HAVANA gene NA
## phase ID gene_id
## <integer> <character> <character>
## ENSG00000233866 <NA> ENSG00000233866.1 ENSG00000233866.1
## ENSG00000229286 <NA> ENSG00000229286.1 ENSG00000229286.1
## ENSG00000235265 <NA> ENSG00000235265.1 ENSG00000235265.1
## ENSG00000223875 <NA> ENSG00000223875.1 ENSG00000223875.1
## ENSG00000215270 <NA> ENSG00000215270.3 ENSG00000215270.3
## ... ... ... ...
## ENSG00000100312 <NA> ENSG00000100312.11 ENSG00000100312.11_4
## ENSG00000254499 <NA> ENSG00000254499.1 ENSG00000254499.1_7
## ENSG00000213683 <NA> ENSG00000213683.4 ENSG00000213683.4_5
## ENSG00000184319 <NA> ENSG00000184319.16 ENSG00000184319.16_6
## ENSG00000079974 <NA> ENSG00000079974.17 ENSG00000079974.17_5
## gene_type gene_name level hgnc_id
## <character> <character> <character> <character>
## ENSG00000233866 lincRNA LA16c-4G1.3 2 <NA>
## ENSG00000229286 pseudogene LA16c-4G1.4 2 <NA>
## ENSG00000235265 pseudogene LA16c-4G1.5 2 <NA>
## ENSG00000223875 pseudogene NBEAP3 2 <NA>
## ENSG00000215270 pseudogene LA16c-60H5.7 1 <NA>
## ... ... ... ... ...
## ENSG00000100312 protein_coding ACR 1 HGNC:126
## ENSG00000254499 lncRNA AC002056.2 2 <NA>
## ENSG00000213683 processed_pseudogene AC002056.1 1 <NA>
## ENSG00000184319 transcribed_unprocessed_pseudogene RPL23AP82 1 HGNC:33730
## ENSG00000079974 protein_coding RABL2B 2 HGNC:9800
## havana_gene remap_status remap_num_mappings
## <character> <character> <character>
## ENSG00000233866 OTTHUMG00000140195.1 <NA> <NA>
## ENSG00000229286 OTTHUMG00000140193.1 <NA> <NA>
## ENSG00000235265 OTTHUMG00000140197.1 <NA> <NA>
## ENSG00000223875 OTTHUMG00000140196.1 <NA> <NA>
## ENSG00000215270 OTTHUMG00000140200.1 <NA> <NA>
## ... ... ... ...
## ENSG00000100312 OTTHUMG00000150155.1_4 full_contig 1
## ENSG00000254499 OTTHUMG00000166231.1_7 full_contig 1
## ENSG00000213683 OTTHUMG00000150159.1_5 full_contig 1
## ENSG00000184319 OTTHUMG00000150154.1_6 full_contig 1
## ENSG00000079974 OTTHUMG00000150156.1_5 full_contig 1
## remap_target_status Parent transcript_id transcript_type
## <character> <CharacterList> <character> <character>
## ENSG00000233866 <NA> <NA> <NA>
## ENSG00000229286 <NA> <NA> <NA>
## ENSG00000235265 <NA> <NA> <NA>
## ENSG00000223875 <NA> <NA> <NA>
## ENSG00000215270 <NA> <NA> <NA>
## ... ... ... ... ...
## ENSG00000100312 overlap <NA> <NA>
## ENSG00000254499 overlap <NA> <NA>
## ENSG00000213683 overlap <NA> <NA>
## ENSG00000184319 overlap <NA> <NA>
## ENSG00000079974 overlap <NA> <NA>
## transcript_name transcript_support_level tag
## <character> <character> <CharacterList>
## ENSG00000233866 <NA> <NA>
## ENSG00000229286 <NA> <NA>
## ENSG00000235265 <NA> <NA>
## ENSG00000223875 <NA> <NA>
## ENSG00000215270 <NA> <NA> pseudo_consens
## ... ... ... ...
## ENSG00000100312 <NA> <NA>
## ENSG00000254499 <NA> <NA>
## ENSG00000213683 <NA> <NA> pseudo_consens
## ENSG00000184319 <NA> <NA> pseudo_consens
## ENSG00000079974 <NA> <NA>
## havana_transcript exon_number exon_id remap_original_location
## <character> <character> <character> <character>
## ENSG00000233866 <NA> <NA> <NA> <NA>
## ENSG00000229286 <NA> <NA> <NA> <NA>
## ENSG00000235265 <NA> <NA> <NA> <NA>
## ENSG00000223875 <NA> <NA> <NA> <NA>
## ENSG00000215270 <NA> <NA> <NA> <NA>
## ... ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA> <NA>
## ENSG00000184319 <NA> <NA> <NA> <NA>
## ENSG00000079974 <NA> <NA> <NA> <NA>
## ont protein_id ccdsid gene_status
## <CharacterList> <character> <character> <character>
## ENSG00000233866 <NA> <NA> KNOWN
## ENSG00000229286 <NA> <NA> KNOWN
## ENSG00000235265 <NA> <NA> KNOWN
## ENSG00000223875 <NA> <NA> KNOWN
## ENSG00000215270 <NA> <NA> KNOWN
## ... ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA>
## ENSG00000184319 <NA> <NA> <NA>
## ENSG00000079974 <NA> <NA> <NA>
## remap_substituted_missing_target transcript_status remap_original_id
## <character> <character> <character>
## ENSG00000233866 V19 <NA> <NA>
## ENSG00000229286 V19 <NA> <NA>
## ENSG00000235265 V19 <NA> <NA>
## ENSG00000223875 V19 <NA> <NA>
## ENSG00000215270 V19 <NA> <NA>
## ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA>
## ENSG00000184319 <NA> <NA> <NA>
## ENSG00000079974 <NA> <NA> <NA>
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
gtf_file <- "gencode.v34lift37.annotation.gtf.gz"
gtf_url <- "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/GRCh37_mapping/gencode.v34lift37.annotation.gtf.gz"
# Download GTF from GENCODE
download.file(gtf_url, paste0(data, "/", gtf_file))
# Create GRanges object from GTF
ggtf <- rtracklayer::import("data/gencode.v34lift37.annotation.gtf.gz")
# or from ggtf
genes <- ggtf[seqnames(ggtf) == "chr22" & ggtf$type == "gene"]
names(genes) <- substr(genes$gene_id, 1, 15)
genes
## GRanges object with 1391 ranges and 28 metadata columns:
## seqnames ranges strand | source type score
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
## ENSG00000233866 chr22 16062157-16063236 + | HAVANA gene NA
## ENSG00000229286 chr22 16076052-16076172 - | HAVANA gene NA
## ENSG00000235265 chr22 16084249-16084826 - | HAVANA gene NA
## ENSG00000223875 chr22 16100517-16124973 - | HAVANA gene NA
## ENSG00000215270 chr22 16122720-16123768 + | HAVANA gene NA
## ... ... ... ... . ... ... ...
## ENSG00000100312 chr22 51176624-51183767 + | HAVANA gene NA
## ENSG00000254499 chr22 51179021-51181948 - | HAVANA gene NA
## ENSG00000213683 chr22 51193103-51193862 - | HAVANA gene NA
## ENSG00000184319 chr22 51195376-51239737 + | HAVANA gene NA
## ENSG00000079974 chr22 51205929-51222091 - | HAVANA gene NA
## phase gene_id gene_type
## <integer> <character> <character>
## ENSG00000233866 <NA> ENSG00000233866.1 lincRNA
## ENSG00000229286 <NA> ENSG00000229286.1 pseudogene
## ENSG00000235265 <NA> ENSG00000235265.1 pseudogene
## ENSG00000223875 <NA> ENSG00000223875.1 pseudogene
## ENSG00000215270 <NA> ENSG00000215270.3 pseudogene
## ... ... ... ...
## ENSG00000100312 <NA> ENSG00000100312.11_4 protein_coding
## ENSG00000254499 <NA> ENSG00000254499.1_7 lncRNA
## ENSG00000213683 <NA> ENSG00000213683.4_5 processed_pseudogene
## ENSG00000184319 <NA> ENSG00000184319.16_6 transcribed_unprocessed_pseudogene
## ENSG00000079974 <NA> ENSG00000079974.17_5 protein_coding
## gene_name level hgnc_id havana_gene
## <character> <character> <character> <character>
## ENSG00000233866 LA16c-4G1.3 2 <NA> OTTHUMG00000140195.1
## ENSG00000229286 LA16c-4G1.4 2 <NA> OTTHUMG00000140193.1
## ENSG00000235265 LA16c-4G1.5 2 <NA> OTTHUMG00000140197.1
## ENSG00000223875 NBEAP3 2 <NA> OTTHUMG00000140196.1
## ENSG00000215270 LA16c-60H5.7 1 <NA> OTTHUMG00000140200.1
## ... ... ... ... ...
## ENSG00000100312 ACR 1 HGNC:126 OTTHUMG00000150155.1_4
## ENSG00000254499 AC002056.2 2 <NA> OTTHUMG00000166231.1_7
## ENSG00000213683 AC002056.1 1 <NA> OTTHUMG00000150159.1_5
## ENSG00000184319 RPL23AP82 1 HGNC:33730 OTTHUMG00000150154.1_6
## ENSG00000079974 RABL2B 2 HGNC:9800 OTTHUMG00000150156.1_5
## remap_status remap_num_mappings remap_target_status transcript_id
## <character> <character> <character> <character>
## ENSG00000233866 <NA> <NA> <NA> <NA>
## ENSG00000229286 <NA> <NA> <NA> <NA>
## ENSG00000235265 <NA> <NA> <NA> <NA>
## ENSG00000223875 <NA> <NA> <NA> <NA>
## ENSG00000215270 <NA> <NA> <NA> <NA>
## ... ... ... ... ...
## ENSG00000100312 full_contig 1 overlap <NA>
## ENSG00000254499 full_contig 1 overlap <NA>
## ENSG00000213683 full_contig 1 overlap <NA>
## ENSG00000184319 full_contig 1 overlap <NA>
## ENSG00000079974 full_contig 1 overlap <NA>
## transcript_type transcript_name transcript_support_level tag
## <character> <character> <character> <character>
## ENSG00000233866 <NA> <NA> <NA> <NA>
## ENSG00000229286 <NA> <NA> <NA> <NA>
## ENSG00000235265 <NA> <NA> <NA> <NA>
## ENSG00000223875 <NA> <NA> <NA> <NA>
## ENSG00000215270 <NA> <NA> <NA> pseudo_consens
## ... ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA> pseudo_consens
## ENSG00000184319 <NA> <NA> <NA> pseudo_consens
## ENSG00000079974 <NA> <NA> <NA> <NA>
## havana_transcript exon_number exon_id remap_original_location
## <character> <character> <character> <character>
## ENSG00000233866 <NA> <NA> <NA> <NA>
## ENSG00000229286 <NA> <NA> <NA> <NA>
## ENSG00000235265 <NA> <NA> <NA> <NA>
## ENSG00000223875 <NA> <NA> <NA> <NA>
## ENSG00000215270 <NA> <NA> <NA> <NA>
## ... ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA> <NA>
## ENSG00000184319 <NA> <NA> <NA> <NA>
## ENSG00000079974 <NA> <NA> <NA> <NA>
## ont protein_id ccdsid gene_status
## <character> <character> <character> <character>
## ENSG00000233866 <NA> <NA> <NA> KNOWN
## ENSG00000229286 <NA> <NA> <NA> KNOWN
## ENSG00000235265 <NA> <NA> <NA> KNOWN
## ENSG00000223875 <NA> <NA> <NA> KNOWN
## ENSG00000215270 <NA> <NA> <NA> KNOWN
## ... ... ... ... ...
## ENSG00000100312 <NA> <NA> <NA> <NA>
## ENSG00000254499 <NA> <NA> <NA> <NA>
## ENSG00000213683 <NA> <NA> <NA> <NA>
## ENSG00000184319 <NA> <NA> <NA> <NA>
## ENSG00000079974 <NA> <NA> <NA> <NA>
## remap_substituted_missing_target transcript_status
## <character> <character>
## ENSG00000233866 V19 <NA>
## ENSG00000229286 V19 <NA>
## ENSG00000235265 V19 <NA>
## ENSG00000223875 V19 <NA>
## ENSG00000215270 V19 <NA>
## ... ... ...
## ENSG00000100312 <NA> <NA>
## ENSG00000254499 <NA> <NA>
## ENSG00000213683 <NA> <NA>
## ENSG00000184319 <NA> <NA>
## ENSG00000079974 <NA> <NA>
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
GRanges
objectsH3K27ac <- GRanges(seqnames = DT_H3K27ac$V1,
ranges = IRanges(start = DT_H3K27ac$V2+1, end = DT_H3K27ac$V3),
coverage = DT_H3K27ac$V4)
H3K4me1 <- GRanges(seqnames = DT_H3K4me1$V1,
ranges = IRanges(start = DT_H3K4me1$V2+1, end = DT_H3K4me1$V3),
coverage = DT_H3K4me1$V4)
H3K4me3 <- GRanges(seqnames = DT_H3K4me3$V1,
ranges = IRanges(start = DT_H3K4me3$V2+1, end = DT_H3K4me3$V3),
coverage = DT_H3K4me3$V4)
H3K9ac <- GRanges(seqnames = DT_H3K9ac$V1,
ranges = IRanges(start = DT_H3K9ac$V2+1, end = DT_H3K9ac$V3),
coverage = DT_H3K9ac$V4)
POL2 <- GRanges(seqnames = DT_POL2$V1,
ranges = IRanges(start = DT_POL2$V2+1, end = DT_POL2$V3),
coverage = DT_POL2$V4)
POL2ENLKD <- GRanges(seqnames = DT_POL2ENLKD$V1,
ranges = IRanges(start = DT_POL2ENLKD$V2+1, end = DT_POL2ENLKD$V3),
coverage = DT_POL2ENLKD$V4)
POL2S2P <- GRanges(seqnames = DT_POL2S2P$V1,
ranges = IRanges(start = DT_POL2S2P$V2+1, end = DT_POL2S2P$V3),
coverage = DT_POL2S2P$V4)
CDK9 <- GRanges(seqnames = DT_CDK9$V1,
ranges = IRanges(start = DT_CDK9$V2+1, end = DT_CDK9$V3),
coverage = DT_CDK9$V4)
CDK9ENLKD <- GRanges(seqnames = DT_CDK9ENLKD$V1,
ranges = IRanges(start = DT_CDK9ENLKD$V2+1, end = DT_CDK9ENLKD$V3),
coverage = DT_CDK9ENLKD$V4)
genes <- GRanges(seqnames = DT_genes$V1,
ranges = IRanges(start = DT_genes$V2+1, end = DT_genes$V3),
names = DT_genes$V4, strand = DT_genes$V5)
names(genes) <- DT_genes$V4
Let’s have a look at H3K27ac
.
## GRanges object with 567374 ranges and 1 metadata column:
## seqnames ranges strand | coverage
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr22 16050371-16050570 * | 1
## [2] chr22 16050571-16053780 * | 0
## [3] chr22 16053781-16053860 * | 1
## [4] chr22 16053861-16053980 * | 2
## [5] chr22 16053981-16054060 * | 1
## ... ... ... ... . ...
## [567370] chr22 51235611-51235680 * | 1
## [567371] chr22 51235681-51237100 * | 0
## [567372] chr22 51237101-51237290 * | 1
## [567373] chr22 51237291-51237300 * | 2
## [567374] chr22 51237301-51237490 * | 1
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
GRanges
object## GRanges object with 1215 ranges and 1 metadata column:
## seqnames ranges strand | names
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000230471 chr22 16373082-16377055 * | ENSG00000230471
## ENSG00000230643 chr22 16389486-16389602 * | ENSG00000230643
## ENSG00000225293 chr22 16868421-16871613 * | ENSG00000225293
## ENSG00000226160 chr22 16886057-16886143 * | ENSG00000226160
## ENSG00000229658 chr22 16904669-16906756 * | ENSG00000229658
## ... ... ... ... . ...
## ENSG00000271796 chr22 20434078-20434153 * | ENSG00000271796
## ENSG00000272019 chr22 23474920-23475192 * | ENSG00000272019
## ENSG00000272675 chr22 18036251-18037847 * | ENSG00000272675
## ENSG00000272872 chr22 16154074-16154766 * | ENSG00000272872
## ENSG00000272955 chr22 24375563-24376211 * | ENSG00000272955
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
## GRanges object with 1215 ranges and 1 metadata column:
## seqnames ranges strand | names
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000230471 chr22 16373082 * | ENSG00000230471
## ENSG00000230643 chr22 16389486 * | ENSG00000230643
## ENSG00000225293 chr22 16868421 * | ENSG00000225293
## ENSG00000226160 chr22 16886057 * | ENSG00000226160
## ENSG00000229658 chr22 16904669 * | ENSG00000229658
## ... ... ... ... . ...
## ENSG00000271796 chr22 20434078 * | ENSG00000271796
## ENSG00000272019 chr22 23474920 * | ENSG00000272019
## ENSG00000272675 chr22 18036251 * | ENSG00000272675
## ENSG00000272872 chr22 16154074 * | ENSG00000272872
## ENSG00000272955 chr22 24375563 * | ENSG00000272955
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
normalizedMatrix
objectsHere we obtain the association between ChIP-seq peaks and targets (i.e. TSS) by normalizing into a matrix. We specify the targets regions to be extended to 5Kb upstream and downstream of TSS (extend = 5000
).
For each region, it splits into a list of 50 small windows (w = 50
), then overlaps genomic signals to these small windows and calculates the value for every small window which is the mean value of genomic signals that intersects with the window. The value used for the calculation is controlled by value_column
and how to calcualte the mean value is controlled by mean_mode
. There are several modes for mean_mode according to different types of genomic signals, details can be found here.
40 50 20 values in signal regions (e.g. Coverage)
++++++ +++ +++++ signal regions (e.g. ChIP-seq peaks)
30 values in signal region
++++++ signal region
================= a 17-bp window, and 4 bases did not overlap to any signal regions
**** *** ***
4 3 3 overlap bases
******
6 overlap bases
Mode | Description | Calculation |
---|---|---|
absolute | The mean of all signal regions regardless of their width | (40 + 30 + 50 + 20)/4 |
weighted | The mean of all signal regions weighted by the width of their intersections | (40*4 + 30*6 + 50*3 + 20*3)/(4 + 6 + 3 + 3) |
w0 | The weighted mean between the intersected parts and un-intersected parts | (40*4 + 30*6 + 50*3 + 20*3)/(4 + 6 + 3 + 3 + 4) |
coverage | The mean signal averged by the length of the window | (40*4 + 30*6 + 50*3 + 20*3)/17 |
tss_H3K27ac <- normalizeToMatrix(H3K27ac, tss, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50)
tss_H3K4me1 <- normalizeToMatrix(H3K4me1, tss, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50)
tss_H3K4me3 <- normalizeToMatrix(H3K4me3, tss, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50)
tss_H3K9ac <- normalizeToMatrix(H3K9ac, tss, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50)
tss_POL2 <- normalizeToMatrix(POL2, tss, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50)
Let’s have a look at tss_H3K27ac
.
## Normalize H3K27ac to tss:
## Upstream 5000 bp (100 windows)
## Downstream 5000 bp (100 windows)
## Include target regions (width = 1)
## 1215 target regions
How about the windows and values around TSS?
## u91 u92 u93 u94 u95 u96 u97 u98 u99 u100 d1 d2
## ENSG00000100056 1.00 1.20 1.00 1.28 3.00 3.44 4.00 4.24 5.00 5.20 6.08 6.76
## ENSG00000206203 2.02 3.00 2.18 2.20 1.98 1.84 3.00 2.18 2.42 2.16 1.84 3.00
## ENSG00000063515 3.40 4.62 3.00 3.58 3.44 2.00 2.00 1.42 1.00 0.02 0.00 0.00
## ENSG00000260924 5.96 6.72 10.16 15.72 16.12 19.32 23.64 26.76 36.08 47.88 53.04 52.52
## ENSG00000100075 12.92 12.56 12.84 10.24 9.24 6.16 6.36 7.60 8.04 10.48 9.44 9.56
## ENSG00000070371 9.00 10.32 8.68 10.52 9.60 8.88 9.28 7.20 7.68 6.16 4.40 5.08
## d3 d4 d5 d6 d7 d8 d9 d10
## ENSG00000100056 6.00 5.16 3.92 2.72 1.00 0.56 0.24 1.04
## ENSG00000206203 2.58 2.82 2.16 1.00 1.00 0.18 0.00 0.82
## ENSG00000063515 0.00 0.98 1.38 2.00 3.36 3.02 2.62 2.38
## ENSG00000260924 46.76 31.92 23.60 25.04 36.00 61.92 83.52 92.76
## ENSG00000100075 10.16 7.32 6.52 6.32 4.32 5.16 6.56 6.24
## ENSG00000070371 3.28 3.00 2.88 2.04 5.84 6.00 6.52 6.96
Visualize the ChIP-seq profile around TSS by heatmap
EnrichedHeatmap(tss_H3K27ac, col = c("white", "darkgreen"), name = "H3K27ac",
column_title = "H3K27ac coverage around TSS")
H3K27ac profile (before color optimisation)
Output multiple images to PDF.
pdf("EnrichedHeatmap_1.pdf", width = 4, height = 6, pointsize=12)
EnrichedHeatmap(tss_H3K27ac, col = c("white", "darkgreen"), name = "H3K27ac",
column_title = "H3K27ac coverage around TSS")
EnrichedHeatmap(tss_H3K4me1, col = c("white", "darkgreen"), name = "H3K4me1",
column_title = "H3K4me1 coverage around TSS")
EnrichedHeatmap(tss_H3K4me3, col = c("white", "darkgreen"), name = "H3K4me3",
column_title = "H3K4me3 coverage around TSS")
EnrichedHeatmap(tss_H3K9ac, col = c("white", "darkgreen"), name = "H3K9ac",
column_title = "H3K9ac coverage around TSS")
EnrichedHeatmap(tss_POL2, col = c("white", "blue"), name = "POL2",
column_title = "POL2 coverage around TSS")
invisible(dev.off())
If a vector of colors is specified, sequential values from minimal to maximal are mapped to the colors, and other values are linearly interpolated. Hence, extreme values can affect the color gradient.
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 3 6 14 27 68 205
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 2 3 7 12 17 26 57
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 7 80 150 236 315
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 2 3 5 10 26 76 162
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 5 10 18 45 352
To get around of such extreme values, we define a color mapping function which only maps colors to values less than Nth percentile and the value larger than the Nth percentile uses same color as the 99th percentile.
col1_fun1 <- colorRamp2(quantile(tss_H3K27ac, c(0, 0.99)), c("white", "darkgreen"))
col1_fun2 <- colorRamp2(quantile(tss_H3K4me1, c(0, 0.99)), c("white", "darkgreen"))
col1_fun3 <- colorRamp2(quantile(tss_H3K4me3, c(0, 0.95)), c("white", "darkgreen"))
col1_fun4 <- colorRamp2(quantile(tss_H3K9ac, c(0, 0.99)), c("white", "darkgreen"))
col1_fun5 <- colorRamp2(quantile(tss_POL2, c(0, 0.99)), c("white", "blue"))
EnrichedHeatmap(tss_H3K27ac, col = col1_fun1, name = "H3K27ac",
column_title = "H3K27ac coverage around TSS")
H3K27ac profile (after color optimisation)
Output multiple images to PDF.
pdf("EnrichedHeatmap_2.pdf", width = 4, height = 6, pointsize=12)
EnrichedHeatmap(tss_H3K27ac, col = col1_fun1, name = "H3K27ac",
column_title = "H3K27ac coverage around TSS")
EnrichedHeatmap(tss_H3K4me1, col = col1_fun2, name = "H3K4me1",
column_title = "H3K4me1 coverage around TSS")
EnrichedHeatmap(tss_H3K4me3, col = col1_fun3, name = "H3K4me3",
column_title = "H3K4me3 coverage around TSS")
EnrichedHeatmap(tss_H3K9ac, col = col1_fun4, name = "H3K9ac",
column_title = "H3K9ac coverage around TSS")
EnrichedHeatmap(tss_POL2, col = col1_fun5, name = "POL2",
column_title = "POL2 coverage around TSS")
invisible(dev.off())
We can place multiple heatmaps into a single image.
ht_list <- EnrichedHeatmap(tss_H3K27ac, col = col1_fun1, name = "H3K27ac",
column_title = "H3K27ac") +
EnrichedHeatmap(tss_H3K4me1, col = col1_fun2, name = "H3K4me1", column_title = "H3K4me1") +
EnrichedHeatmap(tss_H3K4me3, col = col1_fun3, name = "H3K4me3", column_title = "H3K4me3") +
EnrichedHeatmap(tss_H3K9ac, col = col1_fun4, name = "H3K9ac", column_title = "H3K9ac") +
EnrichedHeatmap(tss_POL2, col = col1_fun5, name = "POL2", column_title = "POL2") +
Heatmap(log2(rnaseq$cpm+1), col = c("white", "black"), name = "log2(CPM+1)",
show_row_names = FALSE, show_column_names = FALSE, width = unit(10, "mm"))
png("MultipleHeatmaps_1.png", width = 10, height = 10, units = "in", res = 300)
draw(ht_list, ht_gap = unit(c(7, 7, 7, 7, 7), "mm"))
invisible(dev.off())
Multiple heatmaps 1
We can also view enrichment profile over a genomic region.
Here, we will create normalized matrix of POL2 and CDK9 ChIP-seq signals over gene body. The Width of the target regions shown on heatmap can be controlled by target_ratio
which is relative to the width of the complete heatmap.
gb_POL2 <- normalizeToMatrix(POL2, genes, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50, target_ratio = 0.5)
gb_POL2ENLKD <- normalizeToMatrix(POL2ENLKD, genes, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50, target_ratio = 0.5)
gb_POL2S2P <- normalizeToMatrix(POL2S2P, genes, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50, target_ratio = 0.5)
gb_CDK9 <- normalizeToMatrix(CDK9, genes, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50, target_ratio = 0.5)
gb_CDK9ENLKD <- normalizeToMatrix(CDK9ENLKD, genes, value_column = "coverage",
extend = 5000, mean_mode = "w0", w = 50, target_ratio = 0.5)
Check distribution.
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 5 10 18 45 352
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 4 9 16 41 320
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 1 2 3 4 6 23
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 3 4 5 6 29
## 0% 25% 50% 75% 90% 95% 99% 100%
## 0 1 2 3 4 5 6 25
Define color mapping functions.
col2_fun1 = colorRamp2(quantile(gb_POL2, c(0, 0.99)), c("white", "blue"))
col2_fun2 = colorRamp2(quantile(gb_POL2ENLKD, c(0, 0.99)), c("white", "blue"))
col2_fun3 = colorRamp2(quantile(gb_POL2S2P, c(0, 0.99)), c("white", "blue"))
col2_fun4 = colorRamp2(quantile(gb_CDK9, c(0, 0.99)), c("white", "red"))
col2_fun5 = colorRamp2(quantile(gb_CDK9ENLKD, c(0, 0.99)), c("white", "red"))
We will define 3 partitions according to kmeans
clustering of the expression profile, and split the heatmap accordingly.
set.seed(123)
partition <- paste0("cluster", kmeans(log2(rnaseq$cpm+1), centers = 3)$cluster)
lgd = Legend(at = c("cluster1", "cluster2", "cluster3"), title = "Clusters",
type = "lines", legend_gp = gpar(col = 3:5))
axis_name <- c("-5kb", "TSS", "TTS", "+5kb")
ht_list <- Heatmap(partition, col = structure(3:5, names = paste0("cluster", 1:3)),
name = "Partition", show_row_names = FALSE, show_column_names = FALSE,
width = unit(3, "mm")) +
Heatmap(log2(rnaseq$cpm+1), col = c("white", "black"), name = "log2(CPM+1)",
show_row_names = FALSE, show_column_names = FALSE, width = unit(15, "mm"),
top_annotation = HeatmapAnnotation(summary = anno_summary(gp = gpar(fill = 3:5),
outline = FALSE, axis_param = list(side = "left")))) +
EnrichedHeatmap(gb_POL2, col = col2_fun1, name = "POL2", column_title = "POL2",
axis_name = axis_name, axis_name_gp = gpar(fontsize = 8),
top_annotation = HeatmapAnnotation(lines = anno_enriched(gp = gpar(col = 3:5),
axis_param = list(side = "left")))) +
EnrichedHeatmap(gb_POL2ENLKD, col = col2_fun2, name = "POL2 ENLKD", column_title = "POL2 ENL-KD",
axis_name = axis_name, axis_name_gp = gpar(fontsize = 8),
top_annotation = HeatmapAnnotation(lines = anno_enriched(gp = gpar(col = 3:5),
axis_param = list(side = "left")))) +
EnrichedHeatmap(gb_POL2S2P, col = col2_fun3, name = "POL2 S2P", column_title = "POL2 S2P",
axis_name = axis_name, axis_name_gp = gpar(fontsize = 8),
top_annotation = HeatmapAnnotation(lines = anno_enriched(gp = gpar(col = 3:5),
axis_param = list(side = "left")))) +
EnrichedHeatmap(gb_CDK9, col = col2_fun4, name = "CDK9", column_title = "CDK9",
axis_name = axis_name, axis_name_gp = gpar(fontsize = 8),
top_annotation = HeatmapAnnotation(lines = anno_enriched(gp = gpar(col = 3:5),
axis_param = list(side = "left")))) +
EnrichedHeatmap(gb_CDK9ENLKD, col = col2_fun5, name = "CDK9 ENLKD", column_title = "CDK9 ENL-KD",
axis_name = axis_name, axis_name_gp = gpar(fontsize = 8),
top_annotation = HeatmapAnnotation(lines = anno_enriched(gp = gpar(col = 3:5),
axis_param = list(side = "left"))))
png("MultipleHeatmaps_2.png", width = 12, height = 10, units = "in", res = 300)
draw(ht_list, split = partition, ht_gap = unit(c(2, 7, 7, 7, 7, 7), "mm"))
invisible(dev.off())
Multiple heatmaps 2
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/r4/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
## [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 parallel stats graphics grDevices utils datasets
## [9] methods base
##
## other attached packages:
## [1] circlize_0.4.10 EnrichedHeatmap_1.18.2 ComplexHeatmap_2.4.3
## [4] GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.2
## [7] S4Vectors_0.26.1 BiocGenerics_0.34.0 data.table_1.13.0
## [10] knitr_1.29
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 highr_0.8 compiler_4.0.2
## [4] RColorBrewer_1.1-2 XVector_0.28.0 bitops_1.0-6
## [7] tools_4.0.2 zlibbioc_1.34.0 digest_0.6.25
## [10] lattice_0.20-41 evaluate_0.14 clue_0.3-57
## [13] png_0.1-7 rlang_0.4.7 yaml_2.2.1
## [16] xfun_0.16 GenomeInfoDbData_1.2.3 stringr_1.4.0
## [19] cluster_2.1.0 GlobalOptions_0.1.2 locfit_1.5-9.4
## [22] GetoptLong_1.0.2 rmarkdown_2.3 magrittr_1.5
## [25] codetools_0.2-16 matrixStats_0.56.0 htmltools_0.5.0
## [28] shape_1.4.4 colorspace_1.4-1 stringi_1.4.6
## [31] RCurl_1.98-1.2 crayon_1.3.4 rjson_0.2.20