首先,需要加载相关的R包和BSgenome.Hsapiens.UCSC.hg38参考基因组:
``` r
library(BSgenome.Hsapiens.UCSC.hg38)
library(dplyr)
library(ggplot2)
```
然后,读取多个L1的插入位点的染色体坐标excel表格,包含染色体名称chr,起始和终止坐标:
``` r
L1_insertions <- read.csv("L1_insertions.csv")
```
接下来,计算在R包BSgenome.Hsapiens.UCSC.hg38参考基因组上下游2kb范围内,7个特定的motifs (TTAAAA, TTAAGA, TTAGAA, TTGAAA, TTAAAG, CTAAAA, TCAAAA)在L1插入位点的百分比密度分布:
``` r
motifs <- c("TTAAAA", "TTAAGA", "TTAGAA", "TTGAAA", "TTAAAG", "CTAAAA", "TCAAAA")
L1_insertions_motifs <- L1_insertions %>%
mutate(start = pmax(start - 2000, 1),
end = end + 2000) %>%
mutate(sequence = mapply(function(chrom, start, end) {
as.character(BSgenome.Hsapiens.UCSC.hg38[[chrom]][start:end])
}, chrom, start, end)) %>%
mutate_at(vars(sequence), ~ str_count(., motifs)) %>%
select(-c(start, end))
L1_insertions_motifs_density <- L1_insertions_motifs %>%
summarise_at(vars(motifs), ~ sum(.)/length(sequence)/4) %>%
gather(key = "motif", value = "density")
```
最后,画直方图:
``` r
ggplot(L1_insertions_motifs_density, aes(x = motif, y = density)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Motif") +
ylab("Density") +
ggtitle("L1 Insertion Site Motif Density Distribution") +
theme(plot.title = element_text(hjust = 0.5))
```
完整的代码如下:
``` r
library(BSgenome.Hsapiens.UCSC.hg38)
library(dplyr)
library(ggplot2)
L1_insertions <- read.csv("L1_insertions.csv")
motifs <- c("TTAAAA", "TTAAGA", "TTAGAA", "TTGAAA", "TTAAAG", "CTAAAA", "TCAAAA")
L1_insertions_motifs <- L1_insertions %>%
mutate(start = pmax(start - 2000, 1),
end = end + 2000) %>%
mutate(sequence = mapply(function(chrom, start, end) {
as.character(BSgenome.Hsapiens.UCSC.hg38[[chrom]][start:end])
}, chrom, start, end)) %>%
mutate_at(vars(sequence), ~ str_count(., motifs)) %>%
select(-c(start, end))
L1_insertions_motifs_density <- L1_insertions_motifs %>%
summarise_at(vars(motifs), ~ sum(.)/length(sequence)/4) %>%
gather(key = "motif", value = "density")
ggplot(L1_insertions_motifs_density, aes(x = motif, y = density)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Motif") +
ylab("Density") +
ggtitle("L1 Insertion Site Motif Density Distribution") +
theme(plot.title = element_text(hjust = 0.5))
```
结果将会是一个展示7个motifs在L1插入位点的百分比密度分布的直方图。
阅读全文
收起全文