package genetic_code_pak;        -- collection of genetic code utilities, including unigene analysis

        -- genetic_code: set of triples [codon, three-letter amino acid name, one-letter amino acid code].
        -- The three stop codons (TAA, TAG, TGA) carry "*" as their name and "\r" as their one-letter code.
const genetic_code := {["TTT","Phe","F"], ["TTC","Phe","F"], ["TTA","Leu","L"], ["TTG","Leu","L"],
        ["TCT","Ser","S"], ["TCC","Ser","S"], ["TCA","Ser","S"], ["TCG","Ser","S"],
        ["TAT","Tyr","Y"], ["TAC","Tyr","Y"], ["TAA","*","\r"], ["TAG","*","\r"],
        ["TGT","Cys","C"], ["TGC","Cys","C"], ["TGA","*","\r"], ["TGG","Trp","W"],
        ["CTT","Leu","L"], ["CTC","Leu","L"], ["CTA","Leu","L"], ["CTG","Leu","L"],
        ["CCT","Pro","P"], ["CCC","Pro","P"], ["CCA","Pro","P"], ["CCG","Pro","P"],
        ["CAT","His","H"], ["CAC","His","H"], ["CAA","Gln","Q"], ["CAG","Gln","Q"],
        ["CGT","Arg","R"], ["CGC","Arg","R"], ["CGA","Arg","R"], ["CGG","Arg","R"],
        ["ATT","Ile","I"], ["ATC","Ile","I"], ["ATA","Ile","I"], ["ATG","Met","M"],
        ["ACT","Thr","T"], ["ACC","Thr","T"], ["ACA","Thr","T"], ["ACG","Thr","T"],
        ["AAT","Asn","N"], ["AAC","Asn","N"], ["AAA","Lys","K"], ["AAG","Lys","K"],
        ["AGT","Ser","S"], ["AGC","Ser","S"], ["AGA","Arg","R"], ["AGG","Arg","R"],
        ["GTT","Val","V"], ["GTC","Val","V"], ["GTA","Val","V"], ["GTG","Val","V"],
        ["GCT","Ala","A"], ["GCC","Ala","A"], ["GCA","Ala","A"], ["GCG","Ala","A"],
        ["GAT","Asp","D"], ["GAC","Asp","D"], ["GAA","Glu","E"], ["GAG","Glu","E"],
        ["GGT","Gly","G"], ["GGC","Gly","G"], ["GGA","Gly","G"], ["GGG","Gly","G"]};

        -- complement: maps each base letter to its complementary base; lower and upper case are both covered
const complement := {["a","t"],["t","a"],["c","g"],["g","c"],["A","T"],["T","A"],["C","G"],["G","C"]};

        -- Classification used by protein_quality below (letter lists follow the table itself):
        -- Oily:    AGVLIPFMWC    Hydrocarbon or SH, one nitrogen in ring (nonpairing)
        -- Polar:   NQSTY         CONH2 or OH (self-pairing)
        -- Pos:     RKH           NH2 or double N in ring (pairs with complement)
        -- Neg:     DE            COOH (pairs with complement)
const protein_quality := {["A","h"],["G","h"],["V","h"],["L","h"],["I","h"],["P","h"],["F","h"],["M","h"],
        ["W","h"],["C","h"],["N","n"],["Q","n"],["S","n"],["T","n"],["Y","n"],
        ["R","p"],["K","p"],["H","p"],["D","a"],["E","a"]};
        -- h = hydrocarbon, a = acid, p = basic (positive), n = polar neutral
        -- u = unknown: not present in this table; presumably supplied by callers for unlisted residues (TODO confirm)

        -- paths to files used and written
var 
bioinf_course_prefix := "Diana:Diana2003:Pub:FromGiuseppeLATEST:SetlFolder:bioinformatics_course:"; var to_genome_lengths_prefix := bioinf_course_prefix + "unigene:Unigene_analysis_and_psls:genome_lengths:"; var to_raw_unigene_prefix := bioinf_course_prefix + "unigene:"; var gb_active_ix_flat := OM; -- the index flat to the genbank active file var gb_active_file_handle := OM; -- handle for the genbank active file -- ***** Legend for restiction enymes data -- 3.2. Purine (adenine or guanine): R -- 3.3. Pyrimidine (thymine or cytosine): Y -- 3.4. Adenine or thymine: W -- 3.5. Guanine or cytosine: S -- 3.6. Adenine or cytosine: M -- 3.7. Guanine or thymine: K -- 3.8. Adenine or thymine or cytosine: H -- 3.9. Guanine or cytosine or thymine: B -- 3.10. Guanine or adenine or cytosine: V -- 3.11. Guanine or adenine or thymine: D -- 3.12. Guanine or adenine or thymine or cytosine: N const base_weights := {["A",329.2],["T",306.2],["U",306.2],["C",305.2],["G",345.2],["N",329.2]}; const peptide_weights := {["A",129.1],["R",156.2],["N",114.1],["D",115.1], ["C",103.1],["Q",128.1],["E",129.1],["G",57.1], ["H",137.1],["I",113.1],["L",113.1],["K",128.2],["M",131.2],["F",147.2], ["P",97.1],["S",87.1],["T",101.1],["W",186.2],["Y",163.2],["V",99.1],["B",132.6],["Z",146.6]}; const restriction_enzymes_data := {["Aar_I", "AGGCCTN4^N4"], ["Aas_I", "GACNNNN^NNGTC"], ["Aat_I", "AGG^CCT"], ["Aat_II", "GACGT^C"], ["Aau_I", "T^GTACA"], ["Acc_I", "GT^MKAC"], ["Acc_II", "CG^CG"], ["Acc_III", "T^CCGGA"], ["Acc16_I", "TGC^GCA"], ["Acc36_I", "ACCTGC(N)4^(N)4"], ["Acc65_I", "G^GTSCC"], ["Acc113_I", "HGT^ACT"], ["AccB1_I", "G^GYRCC"], ["AccB7_I", "CCA(N)4^NTGG"], ["AccBS_I", "CCG^CTC"], ["Aci_I", "C^CGC"], ["Acl_I", "AA^CGTT"], ["AclW_I", "GGATC(N)4^"], ["Acs_I", "R^AATTY"], ["Acu_I", "CTGAAG(N)16^"], ["Acv_I", "CAC^GTG"], ["Acy_I", "GR^CGYC"], ["Ade_I", "CACNNN^GTG"], ["Afa_I", "GT^AC"], ["Afe_I", "HGC^GCT"], ["Afl_II", "C^TTSHG"], ["Afl_III", "A^CRYGT"], ["Age_I", "A^CCGGT"], ["Ahd_I", 
"GACNNN^NNGTC"], ["Ahl_I", "A^CTAGT"], ["Ale_I", "CACNN^NNGTG"], ["Alo_I", "(N)5^(N)7GAAC(N)6TCC(N)12^"], ["Alu_I", "HG^CT"], ["Alw_I", "GGATC(N)4^"], ["Alw21_I", "GWCCW^C"], ["Alw26_I", "GTCTCN^"], ["Alw44_I", "G^TGCAC"], ["AlwN_I", "CHGNNN^CTG"], ["Ama87_I", "C^YCGRG"], ["Aor51H_I", "HGC^GCT"], ["Apa_I", "GGGCC^C"], ["ApaL_I", "G^TGCAC"], ["Apo_I", "R^AATTY"], ["Asc_I", "GG^CGCGCC"], ["Ase_I", "AT^TSAT"], ["AsiA_I", "A^CCGGT"], ["AsiS_I", "GCGAT^CGC"], ["Asp_I", "GACN^NNGTC"], ["Asp700_I", "GAANN^NNTTC"], ["Asp718_I", "G^GTSCC"], ["AspA2_I", "C^CTAGG"], ["AspE_I", "GACNNN^NNGTC"], ["AspH_I", "GWGCW^C"], ["AspLE_I", "GCG^C"], ["AspS9_I", "G^GNCC"], ["Asu_II", "TT^CGAA"], ["AsuC2_I", "CC^SGG"], ["AsuHP_I", "GGTGA(N)8^"], ["AsuNH_I", "G^CTHGC"], ["Ava_I", "C^YCGRG"], ["Ava_II", "G^GWCC"], ["Avi_II", "TGC^GCA"], ["Avr_II", "C^CTHGG"], ["Axy_I", "CC^TNHGG"], ["Bae_I", "^(N)10AC(N)4"], ["Bal_I", "TGG^CCA"], ["BamH_I", "G^GATCC"], ["Ban_I", "G^GYRCC"], ["Ban_II", "GRGCY^C"], ["Ban_III", "AT^CGAT"], ["Bbe_I", "GGCGC^C"], ["BbrP_I", "CAC^GTC"], ["Bbs_I", "GAHGACNN^"], ["Bbu_I", "GCATG^C"], ["Bbv_I", "GCHGC(N)8^"], ["Bbv12_I", "GWGCW^C"], ["BbvC_I", "CC^TCHGC"], ["Bcc_I", "CCATCNNNN^N"], ["BceA_I", "ACGGC(N)12^(N)2"], ["Bcg_I", "^(N)10CGA(N)6TGC(N)12^"], ["BciV_I", "GTSTCC(N)6^"], ["Bcl_I", "T^GATCA"], ["Bcn_I", "CC^SGG"], ["Bcu_I", "A^CTHGT"], ["Bfa_I", "C^THG"], ["Bfi_I", "ACTGGG(N)5^"], ["Bfm_I", "C^TRYHG"], ["Bfr_I", "C^TTAAG"], ["BfrB_I", "ATG^CAT"], ["Bfu_I", "GTATCC(N)6^"], ["BfuA_I", "ACCTGCNNNN^NNNN"], ["BfuC_I", "^GATC"], ["Bgl_I", "GCC(N)4^NGGC"], ["Bgl_II", "A^GATCT"], ["Bln_I", "C^CTHGG"], ["Blp_I", "GC^TNHGC"], ["Bme18_I", "G^GWCC"], ["Bme1390_I", "CC^NGG"], ["Bme1580_I", "GKGCM^C"], ["BmgB_I", "GAC^GTC"], ["Bmr_I", "ACTGGG"], ["Bmt_I", "GCTAG^C"], ["Bmy_I", "GDGCH^C"], ["Box_I", "GACNN^NNGTC"], ["Bpi_I", "GAHGACNN^"], ["Bpl_I", "^(N)8GHG(N)5CTC(N)13^"], ["Bpm_I", "CTGGHG(N)16^"], ["Bpu10_I", "CC^TNHGC"], ["Bpu14_I", "TT^CGAA"], ["Bpu1102_I", "GC^TNHGC"], 
["BpuA_I", "GAHGACNN^"], ["BpuE_I", "CTTGAG(N)16^"], ["Bsa_I", "GGTCTCN^"], ["Bsa29_I", "AT^CGAT"], ["BsaA_I", "YAC^GTR"], ["BsaB_I", "GATNN^NNATC"], ["BsaH_I", "GR^CGYC"], ["BsaJ_I", "C^CNNGG"], ["BsaM_I", "GAATGCN^"], ["BsaO_I", "CGRY^CG"], ["BsaW_I", "W^CCGGW"], ["BsaX_I", "(N)3(N)9AC(N)5CTCC(N)10^"], ["Bsc_I", "AT^CGAT"], ["Bsc4_I", "CC(N)5^NNGG"], ["Bse1_I", "ACTGGN^"], ["Bse3D_I", "GCAATGNN^"], ["Bse8_I", "GATNN^NNATC"], ["Bse21_I", "CC^TNHGG"], ["Bse118_I", "R^CCGGY"], ["BseA_I", "T^CCGGA"], ["BseB_I", "CCWGG"], ["BseC_I", "AT^CGAT"], ["BseD_I", "C^CNNGG"], ["BseG_I", "GGATGNN^"], ["BseJ_I", "GATNN^NNATC"], ["BseL_I", "CC(N)5^NNGG"], ["BseM_I", "GCAATGNN^"], ["BseM_II", "CTCHG(N)10^"], ["BseN_I", "ACTGGN^"], ["BseP_I", "G^CGCGC"], ["BseR_I", "GHGGHG(N)10^"], ["BseS_I", "GKGCM^C"], ["BseX_I", "GCHGC(8/12)"], ["BseX3_I", "C^GGCCG"], ["BseY_I", "C^CCAGC"], ["Bsg_I", "GTGCHG(N)16^"], ["Bsh1236_I", "CG^CG"], ["Bsh1285_I", "CGRY^CG"], ["Bsh1365_I", "GATNN^NNATC"], ["BshF_I", "GG^CC"], ["BshN_I", "G^GYRCC"], ["BshT_I", "A^CCGGT"], ["BsiB_I", "GATNN^NNATC"], ["BsiE_I", "CGRY^CG"], ["BsiHKA_I", "GWGCW^C"], ["BsiHKC_I", "C^YCGRG"], ["BsiM_I", "T^CCGGA"], ["BsiS_I", "C^CGG"], ["BsiW_I", "C^GTSCG"], ["BsiY_I", "CC(N)5^NNGG"], ["BsiZ_I", "AT^CGAT"], ["Bsl_I", "CC(N)5^NNGG"], ["BslF_I", "^(N)11GGGAC(N)10^"], ["Bsm_I", "GAATGCN^"], ["BsmA_I", "GTCTCN^"], ["BsmB_I", "CGTCTCN^"], ["BsmF_I", "GGGAC(N)10^"], ["Bso31_I", "GGTCTCN^NNNN"], ["BsoB_I", "C^YCGRG"], ["BsoMA_I", "GTCTCN^(N)4"], ["Bsp13_I", "T^CCGGA"], ["Bsp19_I", "C^CATGG"], ["Bsp68_I", "TCG^CGA"], ["Bsp106_I", "AT^CGAT"], ["Bsp119_I", "TT^CGAA"], ["Bsp120_I", "G^GGCCC"], ["Bsp143_I", "^GATC"], ["Bsp143_II", "RGCGC^Y"], ["Bsp1286_I", "GDGCH^C"], ["Bsp1407_I", "T^GTACA"], ["Bsp1720_I", "GC^TNAGC"], ["BspAN_I", "GG^CC"], ["BspC_I", "CGAT^CG"], ["BspCN_I", "CTCAG(N)9^"], ["BspD_I", "AT^CGAT"], ["BspE_I", "T^CCGGA"], ["BspH_I", "T^CATGA"], ["BspL_I", "GGN^NCC"], ["BspLU11_I", "A^CATGT"], ["BspM_I", "ACCTGC(N)4^"], 
["BspM_I", "CTGCA^G"], ["BspP_I", "GGATC(N)4^"], ["BspT_I", "C^TTAAG"], ["BspT104_I", "TT^CGAA"], ["BspT107_I", "G^GYRCC"], ["BspTN_I", "GGTCTN^NNNN"], ["BspX_I", "AT^CGAT"], ["Bsr_I", "ACTGGN^"], ["BsrB_I", "CCG^CTC"], ["BsrD_I", "GCAATGNN^"], ["BsrF_I", "R^CCGGY"], ["BsrG_I", "T^GTACA"], ["BsrS_I", "ACTGGN^"], ["BssA_I", "R^CCGGY"], ["BssEC_I", "C^CNNGG"], ["BssH_I", "C^TCGAG"], ["BssH_II", "G^CGCGC"], ["BssK_I", "^CCNGG"], ["BssNA_I", "GTS^TSC"], ["BssS_I", "C^ACGHG"], ["BssT1_I", "C^CWWGG"], ["Bst2B_I", "C^ACGHG"], ["Bst2U_I", "CC^WGG"], ["Bst4C_I", "CAN^GT"], ["Bst6_I", "CTCTTCN^NNN"], ["Bst7l_I", "GCHGC(N)8^"], ["Bst98_I", "C^TTSHG"], ["Bst1107_I", "GTS^TSC"], ["BstAC_I", "GR^CGYC"], ["BstAP_I", "GCA(N)4^NTGC"], ["BstAU_I", "T^GTACA"], ["BstB_I", "TT^CGAA"], ["BstBA_I", "YAC^GTR"], ["BstC8_I", "GCN^NGC"], ["BstDE_I", "C^TNHG"], ["BstDS_I", "C^CRYGG"], ["BstE_II", "G^GTNACC"], ["BstEN_I", "CCTNN^NNNAGG"], ["BstEN_II", "^GATC"], ["BstF5_I", "GGATGNN^"], ["BstFN_I", "CG^CG"], ["BstH2_I", "RGCGC^Y"], ["BstHH_I", "GCG^C"], ["BstHP_I", "GTT^AAC"], ["BstKT_I", "GAT^C"], ["BstMA_I", "CTGCA^G"], ["BstMB_I", "^GATC"], ["BstMC_I", "CGRY^CG"], ["BstMW_I", "GCNNNNN^NNGC"], ["BstN_I", "CC^WGG"], ["BstNS_I", "RCATG^Y"], ["BstO_I", "CC^WGG"], ["BstP_I", "G^GTNACC"], ["BstPA_I", "GACNN^NNGTC"], ["BstSC_I", "^CCNGG"], ["BstSF_I", "C^TRYHG"], ["BstSN_I", "TSC^GTS"], ["BstU_I", "CG^CG"], ["BstV1_I", "GCAGC(N)8^(N)4"], ["BstV2_I", "GAAGACNN^NNNN"], ["BstX_I", "CCA(N)5^NTGG"], ["BstX2_I", "R^GATCY"], ["BstY_I", "R^GATCY"], ["BstZ_I", "C^GGCCG"], ["BstZ17_I", "GTS^TSC"], ["Bsu15_I", "AT^CGAT"], ["Bsu36_I", "CC^TNHGG"], ["BsuR_I", "GG^CC"], ["BsuTU_I", "AT^CGAT"], ["Btg_I", "CCR^YGG"], ["Btr_I", "CAC^GTC"], ["Bts_I", "GCHGTGNN^"], ["Bve_I", "ACCTGC(N)4^(N)4"], ["Type_II", "Restriction"], ["Enzymes_Sequence", "Restriction"], ["Cac8_I", "GCN^NGC"], ["Cai_I", "CHGNNN^CTG"], ["CciN_I", "GC^GGCCGC"], ["Cel_II", "GC^TNHGC"], ["Cfo_I", "GCG^C"], ["Cfr_I", "Y^GGCCR"], ["Cfr9_I", "C^CCGGG"], 
["Cfr10_I", "R^CCGGY"], ["Cfr13_I", "G^GNCC"], ["Cfr42_I", "CCGC^GG"], ["Cla_I", "AT^CGAT"], ["Cpo_I", "CGGWC^CG"], ["Csp_I", "CG^GWCCG"], ["Csp6_I", "G^TSC"], ["Csp45_I", "TT^CGAA"], ["CspA_I", "A^CCGGT"], ["CviA_II", "C^ATG"], ["CviJ_I", "RG^CY"], ["CviR_I", "TG^CA"], ["CviT_I", "RG^CY"], ["Cvn_I", "CC^TNHGG"], ["Dde_I", "C^TNHG"], ["Dpn_I", "GA^TC"], ["Dpn_II", "^GATC"], ["Dra_I", "TTT^AAA"], ["Dra_II", "RG^GNCCY"], ["Dra_III", "CACNNN^GTG"], ["Drd_I", "GAC(N)4^NNGTC"], ["DseD_I", "GAC(N)4^NNGTC"], ["Eae_I", "Y^GGCCR"], ["Eag_I", "C^GGCCG"], ["Eam1104_I", "CTCTTCN^"], ["Eam1105_I", "GACNNN^NNGTC"], ["Ear_I", "CTCTTCN^"], ["Eci_I", "GGCGGA(N)11^"], ["Ecl136_II", "GHG^CTC"], ["EclHK_I", "GACNNN^NNGTC"], ["EclX_I", "C^GGCCG"], ["Eco24_I", "GRGCY^C"], ["Eco31_I", "GGTCTCN^"], ["Eco32_I", "GAT^ATC"], ["Eco47_I", "G^GWCC"], ["Eco47_III", "HGC^GCT"], ["Eco52_I", "C^GGCCG"], ["Eco57_I", "CTGAAG(N)16^"], ["Eco57M_I", "CTGRAG16^"], ["Eco72_I", "CAC^GTG"], ["Eco81_I", "CC^THGG"], ["Eco88_I", "C^YCGRG"], ["Eco91_I", "G^GTNACC"], ["Eco105_I", "TSC^GTS"], ["Eco130_I", "C^CWWGG"], ["Eco147_I", "HGG^CCT"], ["EcoICR_I", "GHG^CTC"], ["EcoN_I", "CCTNN^NNNHGG"], ["EcoO65_I", "G^GTNACC"], ["EcoO109_I", "RG^GNCCY"], ["EcoR_I", "G^AATTC"], ["EcoR_II", "^CCWGG"], ["EcoR_V", "GAT^ATC"], ["EcoT14_I", "C^CWWGG"], ["EcoT22_I", "ATGCA^T"], ["EcoT38_I", "GRGCY^C"], ["Ege_I", "GGC^GCC"], ["Ehe_I", "GGC^GCC"], ["Erh_I", "C^CWWGG"], ["Esp3_I", "CGTCTCN^"], ["Fal_I", "(N)5^(N)8AAG(N)5CTT(N)13^"], ["Fat_I", "^CATG"], ["Fau_I", "CCCGC(N)4^"], ["FauND_I", "CA^TSTG"], ["Fba_I", "T^GATCA"], ["Fbl_I", "GT^MKAC"], ["Fnu4H_I", "GC^NGC"], ["Fok_I", "GGATG(N)9^"], ["FriO_I", "GRGCY^C"], ["Fse_I", "GGCCGG^CC"], ["Fsp_I", "TGC^GCA"], ["Fsp4H_I", "GC^NGC"], ["FspA_I", "RTGC^GCAY"], ["Fun_I", "AGC^GCT"], ["Fun_II", "G^AATTC"], ["Gsu_I", "CTGGHG(N)16^"], ["Hae_II", "RGCGC^Y"], ["Hae_III", "GG^CC"], ["Hap_II", "C^CGG"], ["Hga_I", "GACGC(N)5^"], ["Hha_I", "GCG^C"], ["Hin1_I", "GR^CGYC"], ["Hin4_I", 
"(N)5^(N)8GAY(N)5VTC(N)13^"], ["Hin6_I", "G^CGC"], ["Hinc_II", "GTY^RAC"], ["Hind_II", "GTY^YAC"], ["Hind_III", "A^HGCTT"], ["Hinf_I", "G^ANTC"], ["HinP1_I", "G^CGC"], ["Hpa_I", "GTT^AAC"], ["Hpa_II", "C^CGG"], ["Hph_I", "GGTGA(N)8^"], ["Hpy8_I", "GTN^NAC"], ["Hpy99_I", "CGWCG^"], ["Hpy188_I", "TCN^GA"], ["Hpy188_III", "TC^NNGA"], ["HpyCH4_III", "ACN^GT"], ["HpyCH4_IV", "A^CGT"], ["HpyCH4_V", "TG^CA"], ["HpyF10_VI", "GC(N)6^NGC"], ["Hsp92_I", "GR^CGYC"], ["Hsp92_II", "CATG^"], ["HspA_I", "G^CGC"], ["Ita_I", "GC^NGC"], ["Kas_I", "G^GCGCC"], ["Kpn_I", "GGTSC^C"], ["Kpn2_I", "T^CCGGA"], ["Ksp_I", "CCGC^GG"], ["Ksp22_I", "T^GATCA"], ["Ksp632_I", "CTCTTCN^"], ["KspA_I", "GTT^AAC"], ["Kzo9_I", "^GATC"], ["Lsp_I", "TT^CGAA"], ["Lwe_I", "GCATC(N)5^(N)4"], ["Mab_I", "A^CCWGGT"], ["Mae_I", "C^TAG"], ["Mae_II", "A^CGT"], ["Mae_III", "^GTNAC"], ["Mam_I", "GATNN^NNATC"], ["Mbi_I", "CCG^CTC"], ["Mbo_I", "^GATC"], ["Mbo_II", "GAHGA(N)8^"], ["Mfe_I", "C^AATTG"], ["Mfl_I", "R^GATCY"], ["Mhl_I", "GDGCH^C"], ["Mls_I", "TGG^CCA"], ["Mlu_I", "A^CGCGT"], ["MluN_I", "TGG^CCA"], ["Mly_I", "GHGTCN5^"], ["Mly113_I", "GG^CGCC"], ["Mme_I", "TCCRAC(N)20^"], ["Mnl_I", "CCTC(N)7^"], ["Mph1103_I", "ATGCA^T"], ["Mro_I", "T^CCGGA"], ["MroN_I", "G^CCGGC"], ["MroX_I", "GAANN^NNTTC"], ["Msc_I", "TGG^CCA"], ["Mse_I", "T^TAA"], ["Msl_I", "CAY(N)4RTG"], ["Msp_I", "C^CGG"], ["Msp17_I", "GR^CGYC"], ["Msp20_I", "TGG^CCA"], ["MspA1_I", "CMG^CKG"], ["MspC_I", "C^TTSHG"], ["MspR9_I", "CC^NGG"], ["Mss_I", "GTTT^AAAC"], ["Mun_I", "C^AATTG"], ["Mva_I", "CC^WGG"], ["Mva1269_I", "GAATGCN^"], ["Mvn_I", "CG^CG"], ["Mwo_I", "GC(N)5^NNGC"], ["Nae_I", "GCC^GGC"], ["Nar_I", "GG^CGCC"], ["Nci_I", "CC^SGG"], ["Nco_I", "C^CATGG"], ["Nde_I", "CA^TSTG"], ["Nde_II", "^GATC"], ["NgoM_IV", "G^CCGGC"], ["Nhe_I", "G^CTHGC"], ["Nla_III", "CATG^"], ["Nla_IV", "GGN^NCC"], ["NmuC_I", "^GTSAC"], ["Not_I", "GC^GGCCGC"], ["Nru_I", "TCG^CGA"], ["NruG_I", "GACNNN^NNGTC"], ["Nsb_I", "TGC^GCA"], ["Nsi_I", "ATGCA^T"], ["Nsp_I", "RCATG^Y"], 
["Nsp_III", "C^YCGRG"], ["Nsp_V", "TT^CGAA"], ["Oli_I", "CACNN^NNGTG"], ["Pac_I", "TTSAT^TSA"], ["Pae_I", "GCATG^C"], ["PaeR7_I", "C^TCGHG"], ["Pag_I", "T^CATGA"], ["Pal_I", "GG^CC"], ["Pau_I", "G^CGCGC"], ["Pce_I", "AGG^CCT"], ["Pci_I", "A^CATGT"], ["Pct_I", "GAATGCN^"], ["Pdi_I", "GCC^GGC"], ["Pdm_I", "GAANN^NNTTC"], ["Pfl23_II", "C^GTSCG"], ["PflB_I", "CCANNNN^NTGG"], ["PflF_I", "GACN^NNGTC"], ["PflM_I", "CCA(N)4^NTGG"], ["Pfo_I", "T^CCNGGA"], ["Pho_I", "GG^CC"], ["PinA_I", "A^CCGGT"], ["Ple_I", "GHGTC(N)4^"], ["Ple19_I", "CGAT^CG"], ["PmaC_I", "CAC^GTG"], ["Pme_I", "GTTT^AAAC"], ["Pml_I", "CAC^GTG"], ["Ppi_I", "(N)5^(N)7GAAC(N)5CTC(N)13^"], ["Pps_I", "GHGTC(N)4^"], ["Ppu10_I", "A^TGCAT"], ["PpuM_I", "RG^GWCCY"], ["PpuX_I", "RG^GWCCY"], ["PshA_I", "GACNN^NNGTC"], ["PshB_I", "AT^TSAT"], ["Psi_I", "TTS^TSA"], ["Psp5_II", "RG^GWCCY"], ["Psp6_I", "^CCWGG"], ["Psp124B_I", "GAGCT^C"], ["Psp1406_I", "AA^CGTT"], ["PspA_I", "C^CCGGG"], ["PspE_I", "G^GTNACC"], ["PspG_I", "^CCWGG"], ["PspL_I", "C^GTSCG"], ["PspN4_I", "GGN^NCC"], ["PspOM_I", "G^GGCCC"], ["PspP_I", "G^GNCC"], ["PspPP_I", "RG^GWCCY"], ["PspX_I", "VC^TCGAGB"], ["Psr_I", "(N)5^(N)7GAAC(N)6TAC(N)12^"], ["Pst_I", "CTGCA^G"], ["Psu_I", "R^GATCY"], ["Psy_I", "GACN^NNGTC"], ["Pvu_I", "CGAT^CG"], ["Pvu_II", "CHG^CTG"], ["Rca_I", "T^CATGA"], ["Rsa_I", "GT^AC"], ["Rsr_II", "CG^GWCCG"], ["Rsr2_I", "CG^GWCCG"], ["Sac_I", "GHGCT^C"], ["Sac_II", "CCGC^GG"], ["Sal_I", "G^TCGAC"], ["SanD_I", "GG^GWCCC"], ["Sap_I", "GCTCTTCN^"], ["Sat_I", "GC^NGC"], ["Sau3A_I", "^GATC"], ["Sau96_I", "G^GNCC"], ["Sbf_I", "CCTGCA^GG"], ["Sca_I", "HGT^ACT"], ["Sch_I", "GHGTC(N)5^"], ["ScrF_I", "CC^NGG"], ["Sda_I", "CCTGCA^GG"], ["Sdu_I", "GDGCH^C"], ["SexA_I", "A^CCWGGT"], ["SfaN_I", "GCATC(N)5^"], ["Sfc_I", "C^TRYHG"], ["Sfi_I", "GGCC(N)4^NGGCC"], ["Sfo_I", "GGC^GCC"], ["Sfr274_I", "C^TCGHG"], ["Sfr303_I", "CCGC^GG"], ["Sfu_I", "TT^CGAA"], ["Sgf_I", "GCGAT^CGC"], ["SgrA_I", "CR^CCGGYG"], ["SgrB_I", "CCGC^GG"], ["Sin_I", "G^GWCC"], ["Sla_I", 
"CTCGAG"], ["Sma_I", "CCC^GGG"], ["Smi_I", "ATTT^AAAT"], ["SmiM_I", "CAYNN^NNRTG"], ["Sml_I", "C^TYRAG"], ["Smu_I", "CCCGCNNNN^NN"], ["SnaB_I", "TSC^GTS"], ["SpaH_I", "GCATG^C"], ["Spe_I", "A^CTHGT"], ["Sph_I", "GCATG^C"], ["Srf_I", "GCCC^GGGC"], ["Sse9_I", "^AATT"], ["Sse8387_I", "CCTGCA^GG"], ["SseB_I", "HGG^CCT"], ["Ssi_I", "CC^GC"], ["Ssp_I", "AAT^ATT"], ["SspB_I", "T^GTSCA"], ["Sst_I", "GHGCT^C"], ["Sst_II", "CCGC^GG"], ["Stu_I", "AGG^CCT"], ["Sty_I", "C^CWWGG"], ["StyD4_I", "^CCNGG"], ["Sun_I", "C^GTSCG"], ["Swa_I", "ATTT^AAAT"], ["Taa_I", "ACN^GT"], ["Tai_I", "ACGT^"], ["Taq_I", "T^CGA"], ["Taq_II", "GACCGA(N)11^"], ["Tas_I", "^AATT"], ["Tat_I", "W^GTACW"], ["Tau_I", "GCSG^C"], ["Tel_I", "GACN^NNGTC"], ["Tfi_I", "G^AWTC"], ["Tha_I", "CG^CG"], ["Tli_I", "CTCGAG"], ["Tru1_I", "T^TAA"], ["Tru9_I", "T^TAA"], ["Tsc_I", "ACGT^"], ["Tse_I", "G^CWGC"], ["Tsp45_I", "^GTSAC"], ["Tsp509_I", "^AATT"], ["TspDT_I", "ATGAA(N)11^"], ["TspE_I", "^AATT"], ["TspGW_I", "ACGGA(N)11^"], ["TspR_I", "NNCASTGNN^"], ["Tth111_I", "GACN^NNGTC"], ["TthHB8_I", "T^CGA"], ["Van91_I", "CCA(N)4^NTGG"], ["Vha464_I", "C^TTAAG"], ["Vne_I", "G^TGCAC"], ["VpaK11B_I", "G^GWCC"], ["Vsp_I", "AT^TAAT"], ["Xag_I", "CCTNN^NNNAGG"], ["Xap_I", "R^AATTY"], ["Xba_I", "T^CTAGA"], ["Xce_I", "RCATG^Y"], ["Xcm_I", "CCA(N)5^(N)4TGG"], ["Xho_I", "C^TCGAG"], ["Xho_II", "R^GATCY"], ["Xma_I", "C^CCGGG"], ["Xma_III", "C^GGCCG"], ["XmaC_I", "C^CCGGG"], ["XmaJ_I", "C^CTAGG"], ["Xmi_I", "GT^MKAC"], ["Xmn_I", "GAANN^NNTTC"], ["Xsp_I", "C^TAG"], ["Zho_I", "AT^CGAT"], ["Zra_I", "GAC^GTC"], ["Zsp2_I", "ATGCA^T"]}; procedure actual_data(rec_triple); -- read a record using its triple procedure cds_start_and_translation(stg); -- try translating a sequence in all three frames; return bet piece procedure tom_in_protein(stg); -- find first start codon in DNA translation procedure align_by_mers(stg1,stg2,mer_size); -- alignment of two strings by the common mer method. 
procedure make_random_dna(n);                    -- make random sequences of bases of given length
procedure histo_dna(tup,name);                   -- histogramming
procedure prepare_dna(stg);                      -- remove junk from fasta record
procedure translate_dna(stg);                    -- translate dna to protein
procedure rev_comp_dna(stg);                     -- reverse and complement bases in DNA string
procedure reverse_dna(stg);                      -- reverse DNA string
procedure complementary_dna(stg);                -- complement bases in DNA string
procedure peek(file_name,strt,n);                -- peek at start of specified file
procedure survey_ncbi_active_list(file_name);    -- initial survey of NCBI active list
procedure index_ncbi_active_list(file_name);     -- index the NCBI active list, producing and writing an index flat
procedure gb_record(gb_record_name);             -- fetch a genbank record by its name
procedure get_by_gi(gi_no);                      -- get a genbank record by its genbank gi number
procedure data_blocks_in(lines);                 -- extract the data blocks from a Genbank record
procedure ncbi_hash(stg);                        -- hash an ncbi GB name
procedure cut_by_enzymes(flat_stg,enz_name_list);
        -- return the list of pieces into which a string of bases
        -- is cut by a specified list of restriction enzymes,
        -- applied in the specified order, return triples [enzA,substg,enzB]
procedure group_strings_by_alignment(tup_of_stgs,tags,mer_size);
        -- find closely matched subgroups in a list of strings

        -- *************************** Unigene Analysis Codes ***************************

procedure filter_psl(file_name);                 -- filter one of Toto's .psl output files
-- procedure build_examine_unigene_indices();    -- build indices of unigene files and then check them (moved to test program at end)
procedure merge_psl_summaries(file_name_list,out_name);        -- merge a list of psl summary files
procedure build_unigene_index(list_of_unigene_filenames,saved_index_name);
        -- prepare and save a unigene file index
procedure tag_and_index_ug_files(file_name_tup);
        -- prepare a unigene file index also showing the starting position of the cds
        -- this routine can prepare a 
-- single index for multiple files
        -- Note that this routine returns an index object, leaving it
        -- to the calling routine (see build_unigene_indices) to save the index object
procedure triples_in_several(input_file);        -- look for cross-species triples; for use with remotely related species
procedure exon_length_compare_histogram(input_file1,input_file2);        -- histogram of exon lengths
procedure histo_lengths(input_file);             -- build histogram of interior exon lengths
procedure triple_occurences(input_file,rix_file);        -- look for multiply occurring triples and assemble annotated groups
procedure show_moduli(thist_line);               -- transform a .thist line to show exon moduli and start of reading frame
procedure find_genbank_id(rec_start,rec_len,file_handle);        -- read a genbank id from the header line

end genetic_code_pak;

package body genetic_code_pak;        -- collection of genetic code utilities

use tkw,string_utility_pak,sort_pak,random_pak,rix_flat_pak,get_lines_pak;
        -- use rix_flat_pak to get the gene annotations or data

const allowed_pept_block_charsA := "AGVLIPFMWCNQSTVRKHDEY ";        -- allowed peptide characters
const allowed_pept_block_charsB := "agvlipfmwcnqstvrkhdey ";
const allowed_prefix_chars := "0123456789 ";
const DNA_strand := 9,q_start := 12,q_end := 13;

var ihandle,ohandle_1,ohandle_2,ohandle_n,badhandle,start_line := OM,histo_exon_count,sec_count;
var hv_debug,lines,lines_with_trips,ohandle;
var rix_obj;                        -- index prepared for access to Unigene information
var alignment_count := 0;           -- for tracking progress with alignments

procedure get_by_gi(gi_no);         -- get a record by its genbank gi number
        -- Builds the Entrez viewer URL for gi_no and returns whatever http_get returns for it.
        -- The URL is now assigned unconditionally: previously the only assignment to url sat
        -- inside a debugging print that appears to have been commented out, which left url
        -- undefined when it was handed to http_get.
        -- NOTE(review): gi_no is concatenated directly onto a string, so it is assumed
        -- to already be a string; confirm against callers.
    url := "ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=" + gi_no;
--  print("fetching url: ",url);        -- debugging trace, left disabled
    return http_get(url);
end get_by_gi;

procedure http_get(url);            -- get url using tkw socket
    Tk ?:= tkw();                   -- open the main TK window if necessary
    url_host := break(url,"/");
    sock := Tk("socket",["www." 
+ url_host + ":80","text"]); -- open socket communication with the HTML host lines := []; sock(OM) := "GET " + url; while (resp := sock(OM)) /= "