DMR注释

bed:chromStart从0开始

gff3:chromStart从1开始

https://cloud.tencent.com/developer/article/1078324

https://www.plob.org/article/3748.html

A.bed

1
2
3
4
5
6
7
8
9
chr1	69200	70000	TCGA-3C-AAAU-10A-01D-A41E-01	53225	0.0055
chr1 95676511 95676518 TCGA-3C-AAAU-10A-01D-A41E-01 2 -1.6636
chr1 95680124 167057183 TCGA-3C-AAAU-10A-01D-A41E-01 24886 0.0053
chr1 167057495 167059336 TCGA-3C-AAAU-10A-01D-A41E-01 3 -1.0999
chr1 167059760 181602002 TCGA-3C-AAAU-10A-01D-A41E-01 9213 -8.00E-04
chr1 181603120 181609567 TCGA-3C-AAAU-10A-01D-A41E-01 6 -1.2009
chr1 879511 894690 TCGA-3C-AAAU-10A-01D-A41E-01 12002 0.0055
chr1 201474400 201474544 TCGA-3C-AAAU-10A-01D-A41E-01 2 -1.4235
chr1 201475220 247813706 TCGA-3C-AAAU-10A-01D-A41E-01 29781 -4.00E-04

B.bed

1
2
3
4
5
6
7
8
9
10
chr1	69091	70008	OR4F5
chr1 367640 368634 OR4F29
chr1 621096 622034 OR4F16
chr1 859308 879961 SAMD11
chr1 879584 894689 NOC2L
chr1 895967 901095 KLHL17
chr1 901877 911245 PLEKHN1
chr1 910584 917473 PERM1
chr1 934342 935552 HES4
chr1 936518 949921 ISG15
1
bedtools intersect -a A.bed  -b  B.bed -wa -wb | bedtools groupby -i - -g 1-4 -c 10 -o collapse
1
2
chr1	69200	70000	TCGA-3C-AAAU-10A-01D-A41E-01	OR4F5
chr1 879511 894690 TCGA-3C-AAAU-10A-01D-A41E-01 SAMD11,NOC2L

需要这样的一个文件

1
2
3
4
5
chr1	69091	70008	exon
chr1 367640 368634 intron
chr1 621096 622034 promoter
chr1 859308 879961 intergenic
....
1
bedtools intersect -a CpG.hyper.bed  -b  csi.bed -wa -wb | bedtools groupby -i - -g 1-4 -c 10 -o collapse > result.bed

实践

  1. 用Galaxy 工具 gff-to-bed将gff3 文件转成bed,然后提取前四列
1
2
3
4
5
6
7
8
9
10
11
12
[qizhengyang@node1 bedtest]$ cat Galaxy3-\[GFF-to-BED_on_data_1\].bed | cut -f1,2,3,4 > csi.bed
[qizhengyang@node1 bedtest]$ head csi.bed
chr4 17943 22804 gene
chr4 17943 22804 mRNA
chr4 17943 18472 five_prime_UTR
chr4 17943 18544 exon
chr4 18472 18544 CDS
chr4 18917 19169 exon
chr4 18917 19169 CDS
chr4 19540 19817 exon
chr4 19540 19817 CDS
chr4 21013 21168 exon
  1. 处理CpG_regions_myDiff25p.hyper.txt
1
2
3
4
5
6
7
8
9
10
11
qizhengyang@node1 bedtest]$ cat CpG_regions_myDiff25p.hyper.txt |cut -f1,2,3
chr start end
chr1 36001 36100
chr1 101501 101600
chr1 162601 162700
chr1 386101 386200
chr1 425401 425500
chr1 435301 435400
chr1 440601 440700
chr1 440701 440800
chr1 440801 440900
  1. 用bedtools工具进行注释
1
2
3
4
5
6
7
8
9
10
11
12
[qizhengyang@node1 bedtest]$ bedtools intersect -a CpG.hyper.bed  -b  csi.bed -wa -wb | bedtools groupby -i - -g 1-3 -c 7 -o collapse > result1.bed
[qizhengyang@node1 bedtest]$ head result1.bed
chr1 386101 386200 gene,mRNA,exon,three_prime_UTR,gene,mRNA,exon,three_prime_UTR,mRNA,exon,three_prime_UTR,mRNA,exon,three_prime_UTR
chr1 501101 501200 gene,mRNA
chr1 501201 501300 gene,mRNA
chr1 501401 501500 gene,mRNA
chr1 691601 691700 gene,mRNA
chr1 723301 723400 gene,mRNA,exon,three_prime_UTR
chr1 736701 736800 gene,mRNA,exon,three_prime_UTR
chr1 758501 758600 gene,mRNA,mRNA,mRNA
chr1 1643701 1643800 exon,three_prime_UTR,gene,mRNA
chr1 2044401 2044500 gene,mRNA,mRNA,mRNA,mRNA

bedtools merge

案例三:-d 两个独立区域间距小于(等于)该值时将被合并为一个区域;-o collapse显示合并了哪些标签

https://anjingwd.github.io/AnJingwd.github.io/2017/08/19/bedtools%E4%BD%BF%E7%94%A8%E6%95%99%E7%A8%8B%E8%AF%A6%E8%A7%A3/

1
2
3
4
chr1	36001	36100
chr1 36101 36108
chr1 162601 162700
chr1 386101 386200
1
bedtools merge -i test.merge -d 5 -c 1 -o count,collapse
1
2
3
chr1	36001	36108	2	chr1,chr1
chr1 162601 162700 1 chr1
chr1 386101 386200 1 chr1
1
2
3
4
5
6
7
8
# 只处理前三列,不会处理第一行
bedtools merge -i CpG_regions_myDiff25p.hyper.txt -d 5 -c 1 -o count,collapse > CpG.merge.hyper.txt
cat CpG.merge.hyper.txt |cut -f1,2,3 > CpG.merge.hyper.bed

# -b不需要考虑顺序,-c 7 代表 基因特征的这一列(应该是这个程序会把a、b文件合并), -g 应该是a文件的列数
bedtools intersect -a CpG.merge.hyper.bed -b Galaxy3-[GFF-to-BED_on_data_1].bed -wa -wb | bedtools groupby -i - -g 1-3 -c 10 -o collapse | less

bedtools intersect -a CpG.merge.hyper.bed -b Galaxy4-\[GFF-to-BED_on_data_3\].bed -wa -wb | bedtools groupby -i - -g 1-3 -c 7 -o collapse > result3.bed

CpG_regions_myDiff25p.hyper.txt

1
2
3
4
5
6
chr     start   end     strand  pvalue  qvalue  meth.diff
chr1 36001 36100 * 7.64041300115147e-07 2.95924501404506e-05 26.027397260274
chr1 101501 101600 * 7.9298876929699e-11 7.72182883208429e-09 33.7349397590361
chr1 162601 162700 * 4.35900104458979e-16 1.09149763553095e-13 46.6709760827408
chr1 386101 386200 * 1.168443361622e-08 7.19208543362641e-07 37.7489177489177
chr1 425401 425500 * 6.76995294656392e-07 2.66176753139068e-05 39.3233082706767

Galaxy3-[GFF-to-BED_on_data_1].bed

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
chr4    17943   22804   gene    0       +
chr4 17943 22804 mRNA 0 +
chr4 17943 18472 five_prime_UTR 0 +
chr4 17943 18544 exon 0 +
chr4 18472 18544 CDS 0 +
chr4 18917 19169 exon 0 +
chr4 18917 19169 CDS 0 +
chr4 19540 19817 exon 0 +
chr4 19540 19817 CDS 0 +
chr4 21013 21168 exon 0 +
chr4 21013 21168 CDS 0 +
chr4 22041 22804 exon 0 +
chr4 22041 22182 CDS 0 +
chr4 22182 22804 three_prime_UTR 0 +
chr4 17943 22804 mRNA 0 +
chr4 17943 18472 five_prime_UTR 0 +
chr4 17943 18544 exon 0 +

流程:

1
2
3
4
bedtools merge -i CpG_regions_myDiff25p.hyper.txt -d 5 -c 1 -o count,collapse > CpG.merge.hyper.txt

# 可以用CpG.merge.hyper.txt这个merge之后的文件,-c 9
bedtools intersect -a CpG.merge.hyper.txt -b Galaxy4-\[GFF-to-BED_on_data_3\].bed -wa -wb | bedtools groupby -i - -g 1-3 -c 9 -o collapse | less

1
2
3
4
5
6
7
8
9
10
11
12
13
14
dos2unix csi.promoter.txt
cut csi.promoter.txt -f1,3,4 > csi.promoter.bed
sed -i 's/$/\tpromoter/g' csi.promoter.bed


cut Galaxy4-\[GFF-to-BED_on_data_3\].bed -f1,2,3,4 > te.bed

# -v OFS 指定输出分隔符
cat Galaxy3-\[GFF-to-BED_on_data_1\].bed | awk -F"\t" -v OFS="\t" '$4=="gene"{print $1,$2,$3,$4}' > gene.bed


cat te.bed gene.bed csi.promoter.bed > csi.merge.bed

bedtools intersect -a CpG.merge.hyper.txt -b csi.merge.bed -wa -wb | bedtools groupby -i - -g 1-3 -c 9 -o collapse > CpG.annotation.txt

然后写python脚本清理信息

1
2
3
4
5
6
7
8
9
10
11
12
with open('DMR_annotation.txt','w') as a:
with open('CpG.annotation.txt', 'r') as f:
for line in f:
line = line.strip().split()
if 'promoter' in line[3]:
line = '\t'.join(line[0:3]) + '\t' + 'promoter' + '\n'
elif 'gene' in line[3]:
line = '\t'.join(line[0:3]) + '\t' + 'gene' + '\n'
else:
line = '\t'.join(line[0:3]) + '\t' + 'TE' + '\n'
print(line)
a.write(line)
1
2
3
4
5
6
7
8
9
10
11
chr1	36001	36100	TE

chr1 101501 101600 promoter

chr1 162601 162700 TE

chr1 386101 386200 gene

chr1 435301 435400 TE

chr1 440601 440900 TE