# Build a per-transcript summary table for the hg38 knownGene annotation
# (exon count, transcript length, 5' UTR length, 3' UTR length) and write it
# out as a tab-separated text file.

# String helpers (str_length).
library(stringr)
# FASTA reading (readDNAStringSet) and sequence lengths (width).
library(Biostrings)

# Directory that holds the input files; the output file is written here too.
# Paths are built explicitly instead of calling setwd(), so the session's
# working directory is left untouched.
data_dir <- path.expand("~/Dropbox (RNA_Biol)/blog")

# ---- Transcript list with exon counts ----
# The ID column is exposed as "X.name" in the resulting data frame.
hg38_kg <- read.delim(
  file.path(data_dir, "hg38_kg.txt"),
  stringsAsFactors = FALSE
)
head(hg38_kg) # inspect
#>       X.name exonCount
#> 1 uc031tla.1         1
#> 2 uc057aty.1         3
#> 3 uc057atz.1         2
#> 4 uc031tlb.1         1
#> 5 uc001aak.4         3
#> 6 uc057aua.1         2

# ---- Transcript sequences (FASTA) ----
# hg38_seq is a DNAStringSet (not a data frame); names() are the transcript
# IDs and width() gives each sequence length.
hg38_seq <- readDNAStringSet(file.path(data_dir, "hg38_kg_seq_fa.txt"))
head(hg38_seq) # inspect

# ---- Combine ID, exon count and transcript length ----
# Look the sequence up by transcript ID rather than relying on both files
# listing the transcripts in exactly the same order; a positional pairing
# would silently attach the wrong length if the two files ever disagreed.
seq_idx <- match(hg38_kg$X.name, names(hg38_seq))
if (anyNA(seq_idx)) {
  warning(
    sum(is.na(seq_idx)),
    " transcript(s) in hg38_kg.txt have no sequence in hg38_kg_seq_fa.txt;",
    " their length is set to NA",
    call. = FALSE
  )
}
hg38_info <- data.frame(
  ID = hg38_kg$X.name,
  exon = hg38_kg$exonCount,
  length = width(hg38_seq)[seq_idx],
  stringsAsFactors = FALSE
)
head(hg38_info) # inspect
#>           ID exon length
#> 1 uc031tla.1    1     68
#> 2 uc057aty.1    3    712
#> 3 uc057atz.1    2    535
#> 4 uc031tlb.1    1    138
#> 5 uc001aak.4    3   1187
#> 6 uc057aua.1    2    590

# ---- 3' UTR sequences ----
# Columns: X.name (transcript ID) and seq (UTR sequence as text).
hg38_utr3 <- read.delim(
  file.path(data_dir, "hg38_kg_utr3.txt"),
  stringsAsFactors = FALSE
)
head(hg38_utr3) # inspect (prints the full sequences, so output is long)

# Add a "length" column holding the number of characters in each sequence.
hg38_utr3[["length"]] <- str_length(hg38_utr3$seq)
head(hg38_utr3) # inspect
# Recorded lengths of the first six rows: 422 490 524 533 433 433

# ---- 5' UTR sequences ----
# Same layout as the 3' UTR table, read into hg38_utr5.
hg38_utr5 <- read.delim(
  file.path(data_dir, "hg38_kg_utr5.txt"),
  stringsAsFactors = FALSE
)
head(hg38_utr5) # inspect

# Add a "length" column holding the number of characters in each sequence.
hg38_utr5[["length"]] <- str_length(hg38_utr5$seq)
head(hg38_utr5) # inspect
# Recorded lengths of the first six rows: 83 50 107 35 30 30

# ---- Attach UTR lengths to hg38_info ----
# For every hg38_info$ID, take the length from the row of the UTR table whose
# X.name is identical. Transcripts absent from the UTR table get NA.
hg38_info[["utr5"]] <- hg38_utr5$length[match(hg38_info$ID, hg38_utr5$X.name)]
head(hg38_info) # inspect
#>           ID exon length utr5
#> 1 uc031tla.1    1     68   NA
#> 2 uc057aty.1    3    712   NA
#> 3 uc057atz.1    2    535   NA
#> 4 uc031tlb.1    1    138   NA
#> 5 uc001aak.4    3   1187   NA
#> 6 uc057aua.1    2    590   NA

hg38_info[["utr3"]] <- hg38_utr3$length[match(hg38_info$ID, hg38_utr3$X.name)]
head(hg38_info) # inspect
#>           ID exon length utr5 utr3
#> 1 uc031tla.1    1     68   NA   NA
#> 2 uc057aty.1    3    712   NA   NA
#> 3 uc057atz.1    2    535   NA   NA
#> 4 uc031tlb.1    1    138   NA   NA
#> 5 uc001aak.4    3   1187   NA   NA
#> 6 uc057aua.1    2    590   NA   NA

# ---- Write the table ----
# Tab-separated, with a header row, no row names and no quoting.
write.table(
  hg38_info,
  file.path(data_dir, "hg38_info.txt"),
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE,
  sep = "\t"
)