Example Scripts¶
This is a collection of example scripts for some of Ursgal’s functionality. The simple example scripts are designed to get familiar with executing engines through Ursgal. Examples of complete workflows can be modified for different needs. Further example scripts can also be found in ~ursgal/example_scripts
Simple Example Scripts¶
Simple example search¶
-
simple_example_search.
main
()¶ Executes a search with OMSSA, XTandem and MS-GF+ on the BSA1.mzML input_file
- usage:
- ./simple_example_search.py
Note
Myrimatch does not work with this file. To use MSAmanda on unix platforms, please install mono (http://www.mono-project.com/download)
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
import shutil
def main():
    """
    Executes a search with OMSSA, XTandem and MS-GF+ on the BSA1.mzML
    input_file

    usage:
        ./simple_example_search.py

    Note:
        Myrimatch does not work with this file.
        To use MSAmanda on unix platforms, please install mono
        (http://www.mono-project.com/download)
    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
            # 'peptide_mapper_class_version' : 'UPeptideMapper_v2',
        },
    )
    # 64-bit Python can run the newer XTandem build; 32-bit needs the older one
    if sys.maxsize > 2 ** 32:
        xtandem = "xtandem_vengeance"
    else:
        xtandem = "xtandem_sledgehammer"
    engine_list = [
        "omssa",
        xtandem,
        "msgfplus_v2016_09_16",
    ]
    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_simple_example_search", "BSA1.mzML"
    )
    if not os.path.exists(mzML_file):
        uc.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        uc.params["http_output_folder"] = os.path.dirname(mzML_file)
        uc.fetch_file(
            engine="get_http_files_1_0_0",
        )
        # The download keeps the '?format=raw' URL suffix (or a
        # platform-mangled variant without the '?'); rename to BSA1.mzML.
        # Narrowed from a bare `except:` — shutil.Error and FileNotFoundError
        # are both OSError subclasses.
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    unified_file_list = []
    for engine in engine_list:
        unified_search_result_file = uc.search(
            input_file=mzML_file, engine=engine, force=False
        )
        unified_file_list.append(unified_search_result_file)
    uc.visualize(
        input_files=unified_file_list,
        engine="venndiagram_1_1_0",
    )
    return


if __name__ == "__main__":
    main()
Simple example using combined fdr (or pep)¶
-
simple_combined_fdr_score.
main
()¶ Executes a search with 3 different search engines on an example file from the data from Barth et al. (The same file that is used in the XTandem version comparison example.)
- usage:
- ./simple_combined_fdr_score.py
This is a simple example script to show how results from multiple search engines can be combined using the Combined FDR Score approach of Jones et al. (2009).
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Executes a search with 3 different search engines on an example file from the
    data from Barth et al. (The same file that is used in the XTandem version
    comparison example.)

    usage:
        ./simple_combined_fdr_score.py

    This is a simple example script to show how results from multiple search engines
    can be combined using the Combined FDR Score approach of Jones et al. (2009).
    """
    engine_list = [
        "omssa_2_1_9",
        "xtandem_piledriver",
        # 'myrimatch_2_1_138',
        "msgfplus_v9979",
    ]
    params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "modifications": [],
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]],
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_include_ext": [
            "JB_FASP_pH8_2-3_28122012.mzML",
        ],
        "ftp_output_folder": os.path.join(
            os.pardir, "example_data", "xtandem_version_comparison"
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
    }
    if not os.path.exists(params["ftp_output_folder"]):
        os.mkdir(params["ftp_output_folder"])
    uc = ursgal.UController(profile="LTQ XL low res", params=params)
    # Download the example mzML file and the database if they are missing
    mzML_file = os.path.join(params["ftp_output_folder"], params["ftp_include_ext"][0])
    if not os.path.exists(mzML_file):
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    if not os.path.exists(params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    # Search with every engine and validate each result with Percolator
    validated_files_list = []
    for engine in engine_list:
        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
        )
        validated_file = uc.validate(
            input_file=unified_result_file,
            engine="percolator_2_08",
        )
        validated_files_list.append(validated_file)
    combined_results = uc.combine_search_results(
        input_files=validated_files_list,
        engine="combine_FDR_0_1",
        # use combine_pep_1_0_0 for combined PEP :)
    )
    print("\tCombined results can be found here:")
    print(combined_results)
    return


if __name__ == "__main__":
    main()
Simple de novo search¶
-
simple_de_novo_search.
main
()¶ Executes a search with Novor and PepNovo on the BSA1.mzML input_file
- usage:
- ./simple_de_novo_search.py
Note
PepNovo currently only works for Linux. Novor needs to be downloaded from http://rapidnovor.com/novor/standalone/ and stored at <ursgal_path>/resources/<platform>/<architecture>/novor_1_1beta
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
import shutil
def main():
    """
    Executes a search with Novor and PepNovo on the BSA1.mzML
    input_file

    usage:
        ./simple_de_novo_search.py

    Note:
        PepNovo currently only works for Linux.
        Novor needs to be downloaded from http://rapidnovor.com/novor/standalone/ and
        stored at <ursgal_path>/resources/<platform>/<architecture>/novor_1_1beta
    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
        },
    )
    engine_list = [
        "novor_1_05",
        "pepnovo",
    ]
    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_simple_de_novo_search", "BSA1.mzML"
    )
    if not os.path.exists(mzML_file):
        uc.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        uc.params["http_output_folder"] = os.path.dirname(mzML_file)
        uc.fetch_file(
            engine="get_http_files_1_0_0",
        )
        # Rename the download artifact ('?format=raw' suffix, or a variant
        # without the '?') to BSA1.mzML. Narrowed from a bare `except:`.
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    unified_file_list = []
    for engine in engine_list:
        unified_search_result_file = uc.search(
            input_file=mzML_file, engine=engine, force=False
        )
        unified_file_list.append(unified_search_result_file)
    uc.visualize(
        input_files=unified_file_list,
        engine="venndiagram_1_1_0",
    )
    return


if __name__ == "__main__":
    main()
Simple crosslink search¶
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
def main():
    """
    Simple crosslink search using Kojak and an example file from Barth et al.
    2014, please note that this is only testing if Kojak works.
    The sample represents no crosslink data in particular, but Kojak is used to
    map possible disulfide bonds.
    Parameters have not been optimized yet, please use this script as a template
    to use Kojak. Please note the different approach for executing Percolator.

    Note:
        Please note that Kojak has to be installed manually at the resources
        folder under engine-folder (kojak_1_5_3).
        Additionally Percolator (2.08) has to be symlinked or copied to
        engine-folder 'kojak_percolator_2_08'.

    Usage:
        ./simple_crosslink_search.py <path_2_chlamydomonas_reinhardtii_database>

    Note:
        The peptide -.GHYLNATAGTC[-34.00]EEMMK.- from Rubisco large subunit is
        detected with a predicted disulfide bond site at this position.
        This site is reported to have a disulfide bond in the Uniprot database.
        The modification (C +32, C -34) was set according to 'Tsai PL, Chen SF,
        Huang SY (2013) Mass spectrometry-based strategies for protein disulfide
        bond identification. Rev Anal Chem 32: 257–268'
        Please use the reference C. reinhardtii (TaxId 3055) proteome from
        Uniprot.
    """
    params = {
        "database": sys.argv[1],
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_include_ext": [
            "JB_FASP_pH8_2-3_28122012.mzML",
        ],
        "ftp_output_folder": os.path.join(
            os.pardir, "example_data", "simple_crosslink_search"
        ),
        "cross_link_definition": ["C C -2 test_if_kojak_runs"],
        "mono_link_definition": ["C 32", "C -34"],
        "modifications": [
            "M,opt,any,Oxidation",  # Met oxidation
        ],
        "precursor_min_mass": 500,
        "precursor_max_mass": 8000,
        "precursor_mass_tolerance_plus": 15,
        "precursor_mass_tolerance_minus": 15,
        "max_accounted_observed_peaks": 0,  # i.e. all
        "max_num_mods": 2,
    }
    if not os.path.exists(params["ftp_output_folder"]):
        os.mkdir(params["ftp_output_folder"])
    uc = ursgal.UController(profile="LTQ XL low res", params=params)
    mzML_file = os.path.join(params["ftp_output_folder"], params["ftp_include_ext"][0])
    if not os.path.exists(mzML_file):
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    # Build a target decoy database from the given target database, if needed
    td_database_name = params["database"].replace(".fasta", "_target_decoy.fasta")
    if not os.path.exists(td_database_name):
        uc.execute_misc_engine(
            input_files=[params["database"]],
            engine="generate_target_decoy_1_0_0",
            output_file_name=td_database_name,
        )
    uc.params["database"] = td_database_name
    engine = "kojak_1_5_3"
    mgf_file = uc.convert(
        input_file=mzML_file,
        engine="mzml2mgf",
    )
    raw_result = uc.search_mgf(
        input_file=mgf_file,
        engine=engine,
        force=False,
    )
    # Kojak writes several output files; validate each Percolator input file
    for extension in uc.unodes[engine]["META_INFO"]["all_extensions"]:
        if "perc" not in extension:
            continue
        file_to_validate = raw_result.replace(
            uc.unodes[engine]["META_INFO"]["output_extension"], extension
        )
        # Best-effort: some extensions may not have been produced by Kojak,
        # so validation failures are deliberately ignored (was a bare except).
        try:
            uc.validate(file_to_validate, "kojak_percolator_2_08")
        except Exception:
            pass
    return


if __name__ == "__main__":
    # Guard added: the script requires the database path as first argument
    if len(sys.argv) < 2:
        print(main.__doc__)
        sys.exit(1)
    main()
Target decoy generation¶
-
target_decoy_generation_example.
main
()¶ Simple example script how to generate a target decoy database.
Note
By default a ‘shuffled peptide preserving cleavage sites’ database is generated. For this script a ‘reverse protein’ database is generated.
usage:
./target_decoy_generation_example.py
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Simple example script how to generate a target decoy database.

    Note:
        By default a 'shuffled peptide preserving cleavage sites' database is
        generated. For this script a 'reverse protein' database is generated.

    usage:
        ./target_decoy_generation_example.py
    """
    uc = ursgal.UController(
        params={
            "enzyme": "trypsin",
            "decoy_generation_mode": "reverse_protein",
        }
    )
    # The BSA example database shipped with Ursgal serves as target input
    input_databases = [os.path.join(os.pardir, "example_data", "BSA.fasta")]
    target_decoy_db = uc.execute_misc_engine(
        input_file=input_databases,
        engine="generate_target_decoy_1_0_0",
        output_file_name="my_BSA_target_decoy.fasta",
    )
    print("Generated target decoy database: {0}".format(target_decoy_db))


if __name__ == "__main__":
    main()
Convert .raw to .mzML¶
-
convert_raw_to_mzml.
main
(input_path=None)¶ Convert a .raw file to .mzML using the ThermoRawFileParser. The given argument can be either a single file or a folder containing raw files.
- Usage:
- ./convert_raw_to_mzml.py <raw_file/raw_file_folder>
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
import glob
def main(input_path=None):
    """
    Convert a .raw file to .mzML using the ThermoRawFileParser.
    The given argument can be either a single file or a folder
    containing raw files.

    Usage:
        ./convert_raw_to_mzml.py <raw_file/raw_file_folder>
    """
    R = ursgal.UController()
    # Check if a single file or a folder was given;
    # collect all raw files if a folder is given
    input_file_list = []
    if input_path.lower().endswith(".raw"):
        input_file_list.append(input_path)
    else:
        for raw in glob.glob(os.path.join("{0}".format(input_path), "*.raw")):
            input_file_list.append(raw)
    # Convert raw file(s)
    for raw_file in input_file_list:
        mzml_file = R.convert(
            input_file=raw_file,
            engine="thermo_raw_file_parser_1_1_2",
        )


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(main.__doc__)
        # Bug fix: previously fell through after printing the usage and
        # crashed with IndexError on sys.argv[1]
        sys.exit(1)
    main(input_path=sys.argv[1])
Simple mgf conversion¶
-
simple_mgf_conversion.
main
()¶ Simple example script to demonstrate conversion for mzML to mgf file conversion
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import shutil
def main():
    """
    Simple example script to demonstrate conversion for mzML to mgf file
    conversion
    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={},
    )
    mzML_file = os.path.join(
        os.pardir, "example_data", "simple_mzml_to_mgf_conversion", "BSA1.mzML"
    )
    if not os.path.exists(mzML_file):
        uc.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        uc.params["http_output_folder"] = os.path.dirname(mzML_file)
        uc.fetch_file(engine="get_http_files_1_0_0")
        # Rename download artifact ('?format=raw' suffix, or a variant
        # without the '?') to BSA1.mzML. Narrowed from a bare `except:`.
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    mgf_file = uc.convert(
        input_file=mzML_file, engine="mzml2mgf_1_0_0"  # from OpenMS example files
    )


if __name__ == "__main__":
    print(__doc__)
    main()
Mgf conversion loop examples¶
-
mgf_conversion_inside_and_outside_loop.
main
()¶
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import shutil
def main():
    """
    Demonstrates two ways of converting mzML to MGF within a search workflow:
    once explicitly outside the search loop (conversion runs a single time)
    and once implicitly inside the loop (conversion is re-run whenever
    parameters such as the prefix change).
    """
    R = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
        },
    )
    engine = "omssa"
    output_files = []
    mzML_file = os.path.join(
        os.pardir, "example_data", "mgf_conversion_example", "BSA1.mzML"
    )
    if not os.path.exists(mzML_file):
        R.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        R.params["http_output_folder"] = os.path.dirname(mzML_file)
        R.fetch_file(engine="get_http_files_1_0_0")
        # Rename download artifact ('?format=raw' suffix, or a variant
        # without the '?') to BSA1.mzML. Narrowed from a bare `except:`.
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    # First method: Convert to MGF outside of the loop:
    # (saves some time cause the MGF conversion is not always re-run)
    mgf_file = R.convert(
        input_file=mzML_file, engine="mzml2mgf_1_0_0"  # from OpenMS example files
    )
    for prefix in ["10ppm", "20ppm"]:
        R.params["prefix"] = prefix
        # NOTE(review): results of this first loop are not collected in
        # output_files — presumably intentional for the demo; confirm.
        output_file = R.search(
            input_file=mgf_file,
            engine=engine,
            # output_file_name = 'some_userdefined_name'
        )
    # Second method: Automatically convert to MGF inside the loop:
    # (MGF conversion is re-run every time because the prefix changed!)
    for prefix in ["5ppm", "15ppm"]:
        R.params["prefix"] = prefix
        output_file = R.search(
            input_file=mzML_file,  # from OpenMS example files
            engine=engine,
            # output_file_name = 'another_fname',
        )
        output_files.append(output_file)
    print("\tOutput files:")
    for f in output_files:
        print(f)


if __name__ == "__main__":
    print(__doc__)
    main()
Simple Venn diagram¶
-
simple_venn_example.
main
()¶ Example for plotting a simple Venn diagram with single ursgal csv files.
- usage:
- ./simple_venn_example.py
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Example for plotting a simple Venn diagram with single ursgal csv files.

    usage:
        ./simple_venn_example.py
    """
    # Labels for the two diagram segments, keyed by input-file position
    label_positions = {"0": "omssa", "1": "xtandem"}
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={"visualization_label_positions": label_positions},
    )
    test_data_dir = os.path.join(os.pardir, "tests", "data")
    input_csvs = [
        os.path.join(test_data_dir, "omssa_2_1_9", "test_BSA1_omssa_2_1_9.csv"),
        os.path.join(
            test_data_dir,
            "xtandem_sledgehammer",
            "test_BSA1_xtandem_sledgehammer.csv",
        ),
    ]
    uc.visualize(
        input_files=input_csvs,
        engine="venndiagram_1_1_0",
        force=True,
    )
    return


if __name__ == "__main__":
    main()
Sanitize csv¶
-
sanitize_combined_results.
main
(infile=None)¶ Sanitize an Ursgal result file after combining results from multiple search engines (or single search engine results, with modified parameters)
- usage:
- ./sanitize_combined_results.py <Ursgal_result_file>
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import sys
import os
def main(infile=None):
    """
    Sanitize an Ursgal result file after combining results
    from multiple search engines (or single search engine results,
    with modified parameters)

    usage:
        ./sanitize_combined_results.py <Ursgal_result_file>
    """
    # Tolerances for a QExactive+ run
    base_params = {
        "precursor_mass_tolerance_minus": 10,
        "precursor_mass_tolerance_plus": 10,
        "frag_mass_tolerance": 10,
        "frag_mass_tolerance_unit": "ppm",
        "-xmx": "32g",
    }
    uc = ursgal.UController(profile="QExactive+", params=base_params)
    # These parameters need to be adjusted based on the input file and the
    # desired type of sanitizing: 'combined PEP' or 'Bayes PEP' can be used
    # for combined PEP results, while 'PEP' should be used for results from
    # single engines. A minimum difference between the top scoring,
    # conflicting PSMs can be defined via 'score_diff_threshold' together
    # with 'threshold_is_log10'.
    sanitize_params = {
        "validation_score_field": "Bayes PEP",
        # 'validation_score_field': 'PEP',
        "bigger_scores_better": False,
        "num_compared_psms": 25,
        "accept_conflicting_psms": False,
        "threshold_is_log10": True,
        "score_diff_threshold": 0.0,
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            # 'Modifications',
            # 'Charge',
            # 'Is decoy',
        ],
        "max_num_psms_per_spec": 1,
        # 'preferred_engines': [],
        "preferred_engines": [
            "msfragger_2_3",
            "pipi_1_4_6",
            "moda_v1_61",
        ],
        "remove_redundant_psms": False,
    }
    uc.params.update(sanitize_params)
    sanitized_combined_results = uc.execute_misc_engine(
        input_file=infile,
        engine="sanitize_csv",
    )


if __name__ == "__main__":
    main(
        infile=sys.argv[1],
    )
Test node execution¶
-
test_node_execution.
main
()¶ Testscript for executing the test node, which also tests the run time determination function.
- Usage:
- ./test_node_excution.py
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import os.path
import sys
import tempfile
import time
def main():
    """
    Testscript for executing the test node, which also tests the run time
    determination function.

    Usage:
        ./test_node_excution.py
    """
    uc = ursgal.UController(verbose=True)
    # mkstemp returns an already-open OS-level file descriptor; wrap it with
    # os.fdopen so it is properly closed (the original leaked the fd and
    # opened the path a second time).
    temp_fd, temp_fpath = tempfile.mkstemp(prefix="ursgal_", suffix=".csv")
    with os.fdopen(temp_fd, "w") as temp_fobject:
        print("test 1,2,3", file=temp_fobject)
    # Run the test node twice: once on the fresh file, once on its output
    test_1 = uc.execute_unode(temp_fpath, "_test_node")
    test_2 = uc.execute_unode(test_1, "_test_node")


if __name__ == "__main__":
    main()
Complete Workflow Scripts¶
Do it all folder wide¶
-
do_it_all_folder_wide.
main
(folder=None, profile=None, target_decoy_database=None)¶ An example test script to search all mzML files which are present in the specified folder. The search is currently performed on 4 search engines and 2 validation engines.
The machine profile has to be specified as well as the target-decoy database.
usage:
./do_it_all_folder_wide.py <mzML_folder> <profile> <target_decoy_database>
Current profiles:
- ‘QExactive+’
- ‘LTQ XL low res’
- ‘LTQ XL high res’
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import sys
import glob
import os
def main(folder=None, profile=None, target_decoy_database=None):
    """
    An example test script to search all mzML files which are present in the
    specified folder. The search is currently performed on 4 search engines
    and 2 validation engines.

    The machine profile has to be specified as well as the target-decoy
    database.

    usage:
        ./do_it_all_folder_wide.py <mzML_folder> <profile> <target_decoy_database>

    Current profiles:
        * 'QExactive+'
        * 'LTQ XL low res'
        * 'LTQ XL high res'
    """
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    for mzml in glob.glob(os.path.join("{0}".format(folder), "*.mzML")):
        mzML_files.append(mzml)
    mass_spectrometer = profile
    # We specify all search engines and validation engines that we want to use in a list
    # (version numbers might differ on windows or mac):
    search_engines = [
        "omssa",
        "xtandem_vengeance",
        "msgfplus_v2016_09_16",
        # 'msamanda_1_0_0_6300',
        # 'myrimatch_2_1_138',
    ]
    validation_engines = [
        "percolator_2_08",
        "qvality",
    ]
    # Modifications that should be included in the search
    all_mods = [
        "C,fix,any,Carbamidomethyl",
        "M,opt,any,Oxidation",
        # 'N,opt,any,Deamidated',
        # 'Q,opt,any,Deamidated',
        # 'E,opt,any,Methyl',
        # 'K,opt,any,Methyl',
        # 'R,opt,any,Methyl',
        "*,opt,Prot-N-term,Acetyl",
        # 'S,opt,any,Phospho',
        # 'T,opt,any,Phospho',
        # 'N,opt,any,HexNAc'
    ]
    # Initializing the Ursgal UController class with
    # our specified modifications and mass spectrometer
    params = {
        "database": target_decoy_database,
        "modifications": all_mods,
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ],
    }
    uc = ursgal.UController(profile=mass_spectrometer, params=params)
    # complete workflow:
    # every spectrum file is searched with every search engine,
    # results are validated (for each engine separately),
    # validated results are merged and filtered for targets and PEP <= 0.01.
    # In the end, all filtered results from all spectrum files are merged
    for validation_engine in validation_engines:
        result_files = []
        for spec_file in mzML_files:
            validated_results = []
            for search_engine in search_engines:
                unified_search_results = uc.search(
                    input_file=spec_file,
                    engine=search_engine,
                )
                validated_csv = uc.validate(
                    input_file=unified_search_results,
                    engine=validation_engine,
                )
                validated_results.append(validated_csv)
            validated_results_from_all_engines = uc.execute_misc_engine(
                input_file=validated_results,
                engine="merge_csvs_1_0_0",
            )
            filtered_validated_results = uc.execute_misc_engine(
                input_file=validated_results_from_all_engines,
                engine="filter_csv_1_0_0",
            )
            result_files.append(filtered_validated_results)
        results_all_files = uc.execute_misc_engine(
            input_file=result_files,
            engine="merge_csvs_1_0_0",
        )


if __name__ == "__main__":
    # Bug fix: the script needs three arguments (sys.argv[1..3]), so
    # len(sys.argv) must be at least 4; the original checked `< 3` and
    # crashed with IndexError when only two arguments were supplied.
    if len(sys.argv) < 4:
        print(main.__doc__)
        sys.exit(1)
    main(
        folder=sys.argv[1],
        profile=sys.argv[2],
        target_decoy_database=sys.argv[3],
    )
Large scale data analysis¶
-
barth_et_al_large_scale.
main
(folder)¶ Example script for reproducing the data for figure 3
usage:
./barth_et_al_large_scale.py <folder>
The folder determines the target folder where the files will be downloaded
Chlamydomonas reinhardtii samples
Three biological replicates of 4 conditions (2_3, 2_4, 3_1, 4_1)
For more details on the samples please refer to Barth, J.; Bergner, S. V.; Jaeger, D.; Niehues, A.; Schulze, S.; Scholz, M.; Fufezan, C. The interplay of light and oxygen in the reactive oxygen stress response of Chlamydomonas reinhardtii dissected by quantitative mass spectrometry. MCP 2014, 13 (4), 969–989.
Merge all search results (per biological replicate and condition, on folder level) on engine level and validate via percolator.
‘LTQ XL high res’:
- repetition 1
- repetition 2
‘LTQ XL low res’:
- repetition 3
Database:
- Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta
Note
The database and the files will be automatically downloaded from our webpage and peptideatlas
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import os.path
import sys
def main(folder):
    """
    Example script for reproducing the data for figure 3

    usage:
        ./barth_et_al_large_scale.py <folder>

    The folder determines the target folder where the files will be downloaded

    Chlamydomonas reinhardtii samples
    Three biological replicates of 4 conditions (2_3, 2_4, 3_1, 4_1)

    For more details on the samples please refer to
    Barth, J.; Bergner, S. V.; Jaeger, D.; Niehues, A.; Schulze, S.; Scholz,
    M.; Fufezan, C. The interplay of light and oxygen in the reactive oxygen
    stress response of Chlamydomonas reinhardtii dissected by quantitative mass
    spectrometry. MCP 2014, 13 (4), 969–989.

    Merge all search results (per biological replicate and condition, on folder
    level) on engine level and validate via percolator.

    'LTQ XL high res':
        * repetition 1
        * repetition 2

    'LTQ XL low res':
        * repetition 3

    Database:
        * Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta

    Note:
        The database and the files will be automatically downloaded from our
        webpage and peptideatlas
    """
    input_params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "modifications": [
            "M,opt,any,Oxidation",
            "*,opt,Prot-N-term,Acetyl",  # N-Acetylation
        ],
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_output_folder_root": folder,
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
    }
    uc = ursgal.UController(params=input_params)
    if not os.path.exists(input_params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    # Maps (target subfolder, machine profile) -> mzML files to download
    output_folder_to_file_list = {
        ("rep1_sample_2_3", "LTQ XL high res"): [
            "CF_07062012_pH8_2_3A.mzML",
            "CF_13062012_pH3_2_3A.mzML",
            "CF_13062012_pH4_2_3A.mzML",
            "CF_13062012_pH5_2_3A.mzML",
            "CF_13062012_pH6_2_3A.mzML",
            "CF_13062012_pH11FT_2_3A.mzML",
        ],
        ("rep1_sample_2_4", "LTQ XL high res"): [
            "CF_07062012_pH8_2_4A.mzML",
            "CF_13062012_pH3_2_4A_120615113039.mzML",
            "CF_13062012_pH4_2_4A.mzML",
            "CF_13062012_pH5_2_4A.mzML",
            "CF_13062012_pH6_2_4A.mzML",
            "CF_13062012_pH11FT_2_4A.mzML",
        ],
        ("rep1_sample_3_1", "LTQ XL high res"): [
            "CF_12062012_pH8_1_3A.mzML",
            "CF_13062012_pH3_1_3A.mzML",
            "CF_13062012_pH4_1_3A.mzML",
            "CF_13062012_pH5_1_3A.mzML",
            "CF_13062012_pH6_1_3A.mzML",
            "CF_13062012_pH11FT_1_3A.mzML",
        ],
        ("rep1_sample_4_1", "LTQ XL high res"): [
            "CF_07062012_pH8_1_4A.mzML",
            "CF_13062012_pH3_1_4A.mzML",
            "CF_13062012_pH4_1_4A.mzML",
            "CF_13062012_pH5_1_4A.mzML",
            "CF_13062012_pH6_1_4A.mzML",
            "CF_13062012_pH11FT_1_4A.mzML",
        ],
        ("rep2_sample_2_3", "LTQ XL high res"): [
            "JB_18072012_2-3_A_FT.mzML",
            "JB_18072012_2-3_A_pH3.mzML",
            "JB_18072012_2-3_A_pH4.mzML",
            "JB_18072012_2-3_A_pH5.mzML",
            "JB_18072012_2-3_A_pH6.mzML",
            "JB_18072012_2-3_A_pH8.mzML",
        ],
        ("rep2_sample_2_4", "LTQ XL high res"): [
            "JB_18072012_2-4_A_FT.mzML",
            "JB_18072012_2-4_A_pH3.mzML",
            "JB_18072012_2-4_A_pH4.mzML",
            "JB_18072012_2-4_A_pH5.mzML",
            "JB_18072012_2-4_A_pH6.mzML",
            "JB_18072012_2-4_A_pH8.mzML",
        ],
        ("rep2_sample_3_1", "LTQ XL high res"): [
            "JB_18072012_3-1_A_FT.mzML",
            "JB_18072012_3-1_A_pH3.mzML",
            "JB_18072012_3-1_A_pH4.mzML",
            "JB_18072012_3-1_A_pH5.mzML",
            "JB_18072012_3-1_A_pH6.mzML",
            "JB_18072012_3-1_A_pH8.mzML",
        ],
        ("rep2_sample_4_1", "LTQ XL high res"): [
            "JB_18072012_4-1_A_FT.mzML",
            "JB_18072012_4-1_A_pH3.mzML",
            "JB_18072012_4-1_A_pH4.mzML",
            "JB_18072012_4-1_A_pH5.mzML",
            "JB_18072012_4-1_A_pH6.mzML",
            "JB_18072012_4-1_A_pH8.mzML",
        ],
        ("rep3_sample_2_3", "LTQ XL low res"): [
            "JB_FASP_pH3_2-3_28122012.mzML",
            "JB_FASP_pH4_2-3_28122012.mzML",
            "JB_FASP_pH5_2-3_28122012.mzML",
            "JB_FASP_pH6_2-3_28122012.mzML",
            "JB_FASP_pH8_2-3_28122012.mzML",
            "JB_FASP_pH11-FT_2-3_28122012.mzML",
        ],
        ("rep3_sample_2_4", "LTQ XL low res"): [
            "JB_FASP_pH3_2-4_28122012.mzML",
            "JB_FASP_pH4_2-4_28122012.mzML",
            "JB_FASP_pH5_2-4_28122012.mzML",
            "JB_FASP_pH6_2-4_28122012.mzML",
            "JB_FASP_pH8_2-4_28122012.mzML",
            "JB_FASP_pH11-FT_2-4_28122012.mzML",
        ],
        ("rep3_sample_3_1", "LTQ XL low res"): [
            "JB_FASP_pH3_3-1_28122012.mzML",
            "JB_FASP_pH4_3-1_28122012.mzML",
            "JB_FASP_pH5_3-1_28122012.mzML",
            "JB_FASP_pH6_3-1_28122012.mzML",
            "JB_FASP_pH8_3-1_28122012.mzML",
            "JB_FASP_pH11-FT_3-1_28122012.mzML",
        ],
        ("rep3_sample_4_1", "LTQ XL low res"): [
            "JB_FASP_pH3_4-1_28122012.mzML",
            "JB_FASP_pH4_4-1_28122012.mzML",
            "JB_FASP_pH5_4-1_28122012.mzML",
            "JB_FASP_pH6_4-1_28122012.mzML",
            "JB_FASP_pH8_4-1_28122012.mzML",
            "JB_FASP_pH11-FT_4-1_28122012_130121201449.mzML",
        ],
    }
    # Download every replicate/condition folder from peptideatlas
    for (outfolder, profile), mzML_file_list in sorted(
        output_folder_to_file_list.items()
    ):
        uc.params["ftp_output_folder"] = os.path.join(
            input_params["ftp_output_folder_root"], outfolder
        )
        uc.params["ftp_include_ext"] = mzML_file_list
        if not os.path.exists(uc.params["ftp_output_folder"]):
            os.makedirs(uc.params["ftp_output_folder"])
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    if not os.path.exists(input_params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    search_engines = [
        "omssa_2_1_9",
        "xtandem_piledriver",
        "myrimatch_2_1_138",
        "msgfplus_v9979",
        "msamanda_1_0_0_5243",
    ]
    # This dict will be populated with the percolator-validated results
    # of each engine (3 replicates x 4 conditions = 12 files each)
    percolator_results = {
        "omssa_2_1_9": [],
        "xtandem_piledriver": [],
        "msgfplus_v9979": [],
        "myrimatch_2_1_138": [],
        "msamanda_1_0_0_5243": [],
    }
    for search_engine in search_engines:
        for mzML_dir_ext, mass_spectrometer in output_folder_to_file_list.keys():
            # Each folder was measured on a specific machine; switch profile
            uc.set_profile(mass_spectrometer)
            mzML_dir = os.path.join(
                input_params["ftp_output_folder_root"], mzML_dir_ext
            )
            # all files ending with .mzml in that directory will be used!
            unified_results_list = []
            for filename in glob.glob(os.path.join(mzML_dir, "*.mzML")):
                if filename.lower().endswith(".mzml"):
                    unified_search_results = uc.search(
                        input_file=filename,
                        engine=search_engine,
                    )
                    unified_results_list.append(unified_search_results)
            # Merging results from the 6 pH-fractions:
            merged_unified = uc.execute_misc_engine(
                input_file=unified_results_list,
                engine="merge_csvs_1_0_0",
            )
            # Validation with Percolator:
            percolator_validated = uc.validate(
                input_file=merged_unified,
                engine="percolator_2_08",  # one could replace this with 'qvality'
            )
            percolator_results[search_engine].append(percolator_validated)
    # At this point, the analysis is finished. We got
    # Percolator-validated results for each of the 3
    # replicates and 12 conditions.
    # But let's see how well the five search engines
    # performed! To compare, we collect all PSMs with
    # an estimated FDR <= 0.01 for each engine, and
    # plot this information with the VennDiagram UNode.
    # We will also use the Combine FDR Score method
    # to combine the results from all five engines,
    # and increase the number of identified peptides.
    five_large_merged = []
    filtered_final_results = []
    # We will estimate the FDR for all 60 files
    # (5 engines x 12 files) when using percolator PEPs as
    # quality score
    uc.params["validation_score_field"] = "PEP"
    uc.params["bigger_scores_better"] = False
    # To obtain smaller CSV files (and make plotting less
    # RAM-intensive), we remove all decoys and PSMs above
    # 0.06 FDR
    uc.params["csv_filter_rules"] = [
        ["estimated_FDR", "lte", 0.06],
        ["Is decoy", "equals", "false"],
    ]
    for engine, percolator_validated_list in percolator_results.items():
        # unfiltered files for cFDR script
        twelve_merged = uc.execute_misc_engine(
            input_file=percolator_validated_list,
            engine="merge_csvs_1_0_0",
        )
        twelve_filtered = []
        for one_of_12 in percolator_validated_list:
            one_of_12_FDR = uc.validate(
                input_file=one_of_12, engine="add_estimated_fdr_1_0_0"
            )
            one_of_12_FDR_filtered = uc.execute_misc_engine(
                input_file=one_of_12_FDR, engine="filter_csv_1_0_0"
            )
            twelve_filtered.append(one_of_12_FDR_filtered)
        # For the combined FDR scoring, we merge all 12 files:
        filtered_merged = uc.execute_misc_engine(
            input_file=twelve_filtered, engine="merge_csvs_1_0_0"
        )
        five_large_merged.append(twelve_merged)
        filtered_final_results.append(filtered_merged)
    # The five big merged files of each engine are combined:
    cFDR = uc.combine_search_results(
        input_files=five_large_merged,
        engine="combine_FDR_0_1",
    )
    # We estimate the FDR of this combined approach:
    uc.params["validation_score_field"] = "Combined FDR Score"
    uc.params["bigger_scores_better"] = False
    cFDR_FDR = uc.validate(input_file=cFDR, engine="add_estimated_fdr_1_0_0")
    # Removing decoys and low quality hits, to obtain a
    # smaller file:
    uc.params["csv_filter_rules"] = [
        ["estimated_FDR", "lte", 0.06],
        ["Is decoy", "equals", "false"],
    ]
    cFDR_filtered_results = uc.execute_misc_engine(
        input_file=cFDR_FDR,
        engine="filter_csv_1_0_0",
    )
    filtered_final_results.append(cFDR_filtered_results)
    # Since we produced quite a lot of files, let's print the full
    # paths to our most important result files so we find them quickly:
    print(
        """
These files can now be easily parsed and plotted with your
plotting tool of choice! We used the Python plotting library
matplotlib. Each unique combination of Sequence, modification
and charge was counted as a unique peptide.
"""
    )
    print("\n########### Result files: ##############")
    for result_file in filtered_final_results:
        print("\t*{0}".format(result_file))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(main.__doc__)
        sys.exit(1)
    main(sys.argv[1])
Complete workflow for human BR dataset analysis¶
-
human_br_complete_workflow.
main
(folder)¶ usage:
./human_br_complete_workflow.py <folder_with_human_br_files>
This script produces the data for figure 3.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
import pprint
def main(folder):
    """
    usage:
        ./human_br_complete_workflow.py <folder_with_human_br_files>

    This script produces the data for figure 3.

    Searches the four mzML files of ProteomeXchange dataset PXD000263 with
    five search engines (parameters from Wen et al. 2015 where available),
    validates each result with Percolator and combines the merged,
    validated results with combined-FDR scoring for a 3-engine and a
    5-engine summary.

    Args:
        folder (str): folder that contains the four converted mzML files;
            the script exits with status 1 if any file is missing.
    """
    # Initialize the UController:
    uc = ursgal.UController(
        params={
            "enzyme": "trypsin",
            "decoy_generation_mode": "reverse_protein",
        }
    )
    # MS Spectra, downloaded from http://proteomecentral.proteomexchange.org
    # via the dataset accession PXD000263 and converted to mzML
    mass_spec_files = [
        "120813OTc1_NQL-AU-0314-LFQ-LCM-SG-01_013.mzML",
        "120813OTc1_NQL-AU-0314-LFQ-LCM-SG-02_025.mzML",
        "120813OTc1_NQL-AU-0314-LFQ-LCM-SG-03_033.mzML",
        "120813OTc1_NQL-AU-0314-LFQ-LCM-SG-04_048.mzML",
    ]
    # Abort early with instructions if any required spectrum file is missing.
    for mass_spec_file in mass_spec_files:
        if os.path.exists(os.path.join(folder, mass_spec_file)) is False:
            print(
                "Please download RAW files to folder {} and convert to mzML:".format(
                    folder
                )
            )
            pprint.pprint(mass_spec_files)
            sys.exit(1)
    # mods from Wen et al. (2015):
    modifications = [
        # Carbamidomethyl (C) was set as fixed modification
        "C,fix,any,Carbamidomethyl",
        "M,opt,any,Oxidation",  # Oxidation (M) as well as
        # Deamidated (NQ) were set as optional modification
        "N,opt,any,Deamidated",
        # Deamidated (NQ) were set as optional modification
        "Q,opt,any,Deamidated",
    ]
    # The target peptide database which will be searched (UniProt Human
    # reference proteome from July 2013)
    target_database = "uniprot_human_UP000005640_created_until_20130707.fasta"
    # Let's turn it into a target decoy database by reversing peptides:
    target_decoy_database = uc.execute_misc_engine(
        input_file=target_database, engine="generate_target_decoy_1_0_0"
    )
    # OMSSA parameters from Wen et al. (2015):
    omssa_params = {
        # (used by default) # -w
        "he": "1000",  # -he 1000
        "zcc": "1",  # -zcc 1
        "frag_mass_tolerance": "0.6",  # -to 0.6
        "frag_mass_tolerance_unit": "da",  # -to 0.6
        "precursor_mass_tolerance_minus": "10",  # -te 10
        "precursor_mass_tolerance_plus": "10",  # -te 10
        "precursor_mass_tolerance_unit": "ppm",  # -teppm
        "score_a_ions": False,  # -i 1,4
        "score_b_ions": True,  # -i 1,4
        "score_c_ions": False,  # -i 1,4
        "score_x_ions": False,  # -i 1,4
        "score_y_ions": True,  # -i 1,4
        "score_z_ions": False,  # -i 1,4
        "enzyme": "trypsin_p",  # -e 10
        "maximum_missed_cleavages": "1",  # -v 1
        "precursor_max_charge": "8",  # -zh 8
        "precursor_min_charge": "1",  # -zl 1
        "tez": "1",  # -tez 1
        "precursor_isotope_range": "0,1",  # -ti 1
        "num_match_spec": "1",  # -hc 1
        "database": target_decoy_database,
        "modifications": modifications,
    }
    # MS-GF+ parameters from Wen et al. (2015):
    msgf_params = {
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_unit": "ppm",
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_minus": "10",
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_plus": "10",
        # the max number of optional modifications per peptide were set as 3
        # (used by default) # number of allowed isotope errors was set as 1
        "enzyme": "trypsin",  # the enzyme was set as trypsin
        # (used by default) # fully enzymatic peptides were specified, i.e. no non-enzymatic termini
        "frag_method": "1",  # the fragmentation method selected in the search was CID
        "max_pep_length": "45",  # the maximum peptide length to consider was set as 45
        # the minimum precursor charge to consider if charges are not specified
        # in the spectrum file was set as 1
        "precursor_min_charge": "1",
        # the maximum precursor charge to consider was set as 8
        "precursor_max_charge": "8",
        # (used by default) # the parameter 'addFeatures' was set as 1 (required for Percolator)
        # all of the other parameters were set as default
        # the instrument selected
        # was High-res
        "database": target_decoy_database,
        "modifications": modifications,
    }
    # X!Tandem parameters from Wen et al. (2015):
    xtandem_params = {
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_unit": "ppm",
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_minus": "10",
        # precursor ion mass tolerance was set to 10 ppm
        "precursor_mass_tolerance_plus": "10",
        # the fragment ion mass tolerance was set to 0.6 Da
        "frag_mass_tolerance": "0.6",
        # the fragment ion mass tolerance was set to 0.6 Da
        "frag_mass_tolerance_unit": "da",
        # parent monoisotopic mass isotope error was set as 'yes'
        "precursor_isotope_range": "0,1",
        "precursor_max_charge": "8",  # maximum parent charge of spectrum was set as 8
        "enzyme": "trypsin",  # the enzyme was set as trypsin ([RK]|[X])
        # the maximum missed cleavage sites were set as 1
        "maximum_missed_cleavages": "1",
        # (used by default) # no model refinement was employed.
        "database": target_decoy_database,
        "modifications": modifications,
    }
    # Each entry: (engine name, engine parameters, instrument profile or None).
    search_engine_settings = [
        # not used in Wen et al., so we use the same settings as xtandem
        ("msamanda_1_0_0_5243", xtandem_params, "LTQ XL high res"),
        # not used in Wen et al., so we use the same settings as xtandem
        ("myrimatch_2_1_138", xtandem_params, "LTQ XL high res"),
        # the instrument selected was High-res
        ("msgfplus_v9979", msgf_params, "LTQ XL high res"),
        ("xtandem_jackhammer", xtandem_params, None),
        ("omssa_2_1_9", omssa_params, None),
    ]
    merged_validated_files_3_engines = []
    merged_validated_files_5_engines = []
    for engine, wen_params, instrument in search_engine_settings:
        # Initializing the uPLANIT UController class with
        # our specified modifications and mass spectrometer
        uc = ursgal.UController(params=wen_params)
        if instrument is not None:
            uc.set_profile(instrument)
        unified_results = []
        percolator_validated_results = []
        # Search and validate every mzML file with the current engine.
        for mzML_file in mass_spec_files:
            unified_search_results = uc.search(
                input_file=mzML_file,
                engine=engine,
            )
            unified_results.append(unified_search_results)
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine="percolator_2_08",
            )
            percolator_validated_results.append(validated_csv)
        # Merge all per-file results for this engine (validated & unvalidated).
        merged_validated_csv = uc.execute_misc_engine(
            input_file=percolator_validated_results, engine="merge_csvs_1_0_0"
        )
        merged_unvalidated_csv = uc.execute_misc_engine(
            input_file=unified_results,
            engine="merge_csvs_1_0_0",
        )
        # Only these three engines go into the 3-engine summary; all five
        # engines go into the 5-engine summary.
        if engine in ["omssa_2_1_9", "xtandem_jackhammer", "msgfplus_v9979"]:
            merged_validated_files_3_engines.append(merged_validated_csv)
        merged_validated_files_5_engines.append(merged_validated_csv)
    uc.params["prefix"] = "5-engines-summary"
    uc.combine_search_results(
        input_files=merged_validated_files_5_engines,
        engine="combine_FDR_0_1",
    )
    uc.params["prefix"] = "3-engines-summary"
    uc.combine_search_results(
        input_files=merged_validated_files_3_engines,
        engine="combine_FDR_0_1",
    )
if __name__ == "__main__":
    # CLI entry point: one positional argument (the data folder) is required;
    # otherwise print the usage from main's docstring and exit with an error.
    if len(sys.argv) < 2:
        print(main.__doc__)
        sys.exit(1)
    main(sys.argv[1])
Complete workflow for analysis with pGlyco¶
-
example_pglyco_workflow.
main
(input_raw_file=None, database=None, enzyme=None)¶ - This script executes three steps:
- convert .raw file using pParse
- search the resulting .mgf file using pGlyco
- validate results using pGlycoFDR
Since pGlyco does not support providing a custom target-decoy database, a database including only target sequences must be provided. In this database, the N-glycosylation motif N-X-S/T needs to be replaced with J-X-S/T, which will be done by the pGlyco wrapper.
- usage:
- ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>
#!/usr/bin/env python
# encoding: utf-8
import ursgal
import os
import sys
import shutil
def main(input_raw_file=None, database=None, enzyme=None):
    """
    This script executes three steps:
        - convert .raw file using pParse
        - search the resulting .mgf file using pGlyco
        - validate results using pGlycoFDR

    Since pGlyco does not support providing a custom target-decoy database,
    a database including only target sequences must be provided.
    In this database, the N-glycosylation motif N-X-S/T needs to be replaced
    with J-X-S/T, which will be done by the pGlyco wrapper.

    usage:
        ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>

    Args:
        input_raw_file (str): Thermo .raw file to process.
        database (str): target-only FASTA database (see note above).
        enzyme (str): enzyme used for the digestion (e.g. 'trypsin').
    """
    uc = ursgal.UController(
        profile="QExactive+",
        params={
            "database": database,
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
            "peptide_mapper_class_version": "UPeptideMapper_v4",
            "enzyme": enzyme,
            "frag_mass_tolerance": 20,
            "frag_mass_tolerance_unit": "ppm",
            "precursor_mass_tolerance_plus": 5,
            "precursor_mass_tolerance_minus": 5,
            "aa_exception_dict": {},
            "csv_filter_rules": [
                ["Is decoy", "equals", "false"],
                ["q-value", "lte", 0.01],
                ["Conflicting uparam", "contains_not", "enzyme"],
            ],
        },
    )
    # Step 1: extract spectra from the raw file with pParse (pGlyco's input).
    xtracted_file = uc.convert(
        input_file=input_raw_file,
        engine="pparse_2_2_1",
        # force = True,
    )
    # Also produce an mzML (and from it an MGF) of the same raw file so that
    # downstream nodes can map retention times / spectrum metadata.
    mzml_file = uc.convert(
        input_file=input_raw_file,
        engine="thermo_raw_file_parser_1_1_2",
    )
    # os.remove(mzml_file.replace('.mzML', '.mgf'))
    # os.remove(mzml_file.replace('.mzML', '.mgf.u.json'))
    mgf_file = uc.convert(
        input_file=mzml_file,
        engine="mzml2mgf_2_0_0",
        force=True,
    )
    # Step 2: run the pGlyco database search on the pParse output.
    search_result = uc.search_mgf(
        input_file=xtracted_file,
        engine="pglyco_db_2_2_2",
    )
    # Map peptides to proteins and unify the raw search output.
    mapped_results = uc.execute_misc_engine(
        input_file=search_result,
        engine="upeptide_mapper",
    )
    unified_search_results = uc.execute_misc_engine(
        input_file=mapped_results, engine="unify_csv"
    )
    # Step 3: validate the (raw) pGlyco result with pGlycoFDR, then map,
    # unify and filter the validated output as well.
    validated_file = uc.validate(
        input_file=search_result,
        engine="pglyco_fdr_2_2_0",
    )
    mapped_validated_results = uc.execute_misc_engine(
        input_file=validated_file,
        engine="upeptide_mapper",
    )
    unified_validated_results = uc.execute_misc_engine(
        input_file=mapped_validated_results, engine="unify_csv"
    )
    filtered_validated_results = uc.execute_misc_engine(
        input_file=unified_validated_results,
        engine="filter_csv",
    )
    return
if __name__ == "__main__":
    # CLI entry point: expects <input_raw_file> <database> <enzyme>.
    main(input_raw_file=sys.argv[1], database=sys.argv[2], enzyme=sys.argv[3])
Complete workflow for analysis with TagGraph¶
-
example_pglyco_workflow.
main
(input_raw_file=None, database=None, enzyme=None) - This script executes three steps:
- convert .raw file using pParse
- search the resulting .mgf file using pGlyco
- validate results using pGlycoFDR
Since pGlyco does not support providing a custom target-decoy database, a database including only target sequences must be provided. In this database, the N-glycosylation motif N-X-S/T needs to be replaced with J-X-S/T, which will be done by the pGlyco wrapper.
- usage:
- ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>
#!/usr/bin/env python
# encoding: utf-8
import ursgal
import os
import sys
import shutil
def main(input_raw_file=None, database=None, enzyme=None):
    """
    This script executes three steps:
        - convert .raw file using pParse
        - search the resulting .mgf file using pGlyco
        - validate results using pGlycoFDR

    Since pGlyco does not support providing a custom target-decoy database,
    a database including only target sequences must be provided.
    In this database, the N-glycosylation motif N-X-S/T needs to be replaced
    with J-X-S/T, which will be done by the pGlyco wrapper.

    usage:
        ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>

    Args:
        input_raw_file (str): Thermo .raw file to process.
        database (str): target-only FASTA database (see note above).
        enzyme (str): enzyme used for the digestion (e.g. 'trypsin').
    """
    uc = ursgal.UController(
        profile="QExactive+",
        params={
            "database": database,
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
            "peptide_mapper_class_version": "UPeptideMapper_v4",
            "enzyme": enzyme,
            "frag_mass_tolerance": 20,
            "frag_mass_tolerance_unit": "ppm",
            "precursor_mass_tolerance_plus": 5,
            "precursor_mass_tolerance_minus": 5,
            "aa_exception_dict": {},
            "csv_filter_rules": [
                ["Is decoy", "equals", "false"],
                ["q-value", "lte", 0.01],
                ["Conflicting uparam", "contains_not", "enzyme"],
            ],
        },
    )
    # Step 1: extract spectra from the raw file with pParse (pGlyco's input).
    xtracted_file = uc.convert(
        input_file=input_raw_file,
        engine="pparse_2_2_1",
        # force = True,
    )
    # Also produce an mzML (and from it an MGF) of the same raw file so that
    # downstream nodes can map retention times / spectrum metadata.
    mzml_file = uc.convert(
        input_file=input_raw_file,
        engine="thermo_raw_file_parser_1_1_2",
    )
    # os.remove(mzml_file.replace('.mzML', '.mgf'))
    # os.remove(mzml_file.replace('.mzML', '.mgf.u.json'))
    mgf_file = uc.convert(
        input_file=mzml_file,
        engine="mzml2mgf_2_0_0",
        force=True,
    )
    # Step 2: run the pGlyco database search on the pParse output.
    search_result = uc.search_mgf(
        input_file=xtracted_file,
        engine="pglyco_db_2_2_2",
    )
    # Map peptides to proteins and unify the raw search output.
    mapped_results = uc.execute_misc_engine(
        input_file=search_result,
        engine="upeptide_mapper",
    )
    unified_search_results = uc.execute_misc_engine(
        input_file=mapped_results, engine="unify_csv"
    )
    # Step 3: validate the (raw) pGlyco result with pGlycoFDR, then map,
    # unify and filter the validated output as well.
    validated_file = uc.validate(
        input_file=search_result,
        engine="pglyco_fdr_2_2_0",
    )
    mapped_validated_results = uc.execute_misc_engine(
        input_file=validated_file,
        engine="upeptide_mapper",
    )
    unified_validated_results = uc.execute_misc_engine(
        input_file=mapped_validated_results, engine="unify_csv"
    )
    filtered_validated_results = uc.execute_misc_engine(
        input_file=unified_validated_results,
        engine="filter_csv",
    )
    return
if __name__ == "__main__":
    # CLI entry point: expects <input_raw_file> <database> <enzyme>.
    main(input_raw_file=sys.argv[1], database=sys.argv[2], enzyme=sys.argv[3])
Example search for 15N labeling and no label¶
-
search_with_label_15N.
main
()¶ Executes a search with 3 different search engines on an example file from the data from Barth et al. Two searches are performed per engine, one for 14N (unlabeled) and one for 15N labeling. The overlap of identified peptides for the 14N and 15N searches between the engines is visualized as well as the overlap between all 14N and 15N identified peptides.
- usage:
- ./search_with_label_15N.py
Note
It is important to convert the mgf file outside of (and before :) ) the search loop to avoid mgf file redundancy and to assure correct retention time mapping in the unify_csv node.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Executes a search with 3 different search engines on an example file from
    the data from Barth et al. Two searches are performed per engine, one for
    14N (unlabeled) and one for 15N labeling. The overlap of identified
    peptides for the 14N and 15N searches between the engines is visualized
    as well as the overlap between all 14N and 15N identified peptides.

    usage:
        ./search_with_label_15N.py

    Note:
        It is important to convert the mgf file outside of (and before :) ) the
        search loop to avoid mgf file redundancy and to assure correct retention
        time mapping in the unify_csv node.
    """
    engine_list = [
        "omssa_2_1_9",
        "xtandem_piledriver",
        "msgfplus_v9979",
    ]
    params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "modifications": [],
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]],
        # Credentials for the public PeptideAtlas repository entry PASS00269.
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_include_ext": [
            "JB_FASP_pH8_2-3_28122012.mzML",
        ],
        "ftp_output_folder": os.path.join(
            os.pardir, "example_data", "search_with_label_15N"
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
    }
    if os.path.exists(params["ftp_output_folder"]) is False:
        os.mkdir(params["ftp_output_folder"])
    uc = ursgal.UController(profile="LTQ XL low res", params=params)
    mzML_file = os.path.join(params["ftp_output_folder"], params["ftp_include_ext"][0])
    # Download the example mzML file and the target-decoy database on first run.
    if os.path.exists(mzML_file) is False:
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    if os.path.exists(params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")
    # Convert to MGF once, before the search loop (see Note in the docstring).
    mgf_file = uc.convert(
        input_file=mzML_file,
        engine="mzml2mgf_1_0_0",
    )
    # Maps each label ('14N'/'15N') to its filtered per-engine result files.
    files_2_merge = {}
    label_list = ["14N", "15N"]
    for label in label_list:
        validated_and_filtered_files_list = []
        uc.params["label"] = label
        uc.params["prefix"] = label
        for engine in engine_list:
            # Search, convert, map, unify, validate and filter per engine.
            search_result = uc.search_mgf(
                input_file=mgf_file,
                engine=engine,
            )
            converted_result = uc.convert(
                input_file=search_result,
                guess_engine=True,
            )
            mapped_results = uc.execute_misc_engine(
                input_file=converted_result,
                engine="upeptide_mapper",
            )
            unified_search_results = uc.execute_misc_engine(
                input_file=mapped_results, engine="unify_csv"
            )
            validated_file = uc.validate(
                input_file=unified_search_results,
                engine="percolator_2_08",
            )
            filtered_file = uc.execute_misc_engine(
                input_file=validated_file, engine="filter_csv"
            )
            validated_and_filtered_files_list.append(filtered_file)
        files_2_merge[label] = validated_and_filtered_files_list
        # Venn diagram of the engine overlap for this label:
        uc.visualize(
            input_files=validated_and_filtered_files_list,
            engine="venndiagram_1_1_0",
        )
    # Reset label-specific params before the cross-label comparison.
    uc.params["prefix"] = None
    uc.params["label"] = ""
    uc.params["visualization_label_positions"] = {}
    label_comparison_file_list = []
    for n, label in enumerate(label_list):
        uc.params["visualization_label_positions"][str(n)] = label
        label_comparison_file_list.append(
            uc.execute_misc_engine(input_file=files_2_merge[label], engine="merge_csvs")
        )
    # Venn diagram comparing the merged 14N vs. 15N identifications:
    uc.visualize(
        input_files=label_comparison_file_list,
        engine="venndiagram_1_1_0",
    )
    return
if __name__ == "__main__":
    # No CLI arguments needed; all inputs are fetched/derived inside main().
    main()
Open Modification Search Scripts¶
Open modification search with three engines and combined PEP¶
-
open_modification_search_incl_combined_pep.
main
(folder=None, database=None, enzyme=None)¶ Example workflow to perform an open modification search with three independent search engines across all mzML files of a given folder and to statistically post-process and combine the results of all searches.
- Usage:
- ./open_modification_search_incl_combined_pep.py <mzML_folder> <database> <enzyme>
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import sys
import glob
import os
import pprint
def main(folder=None, database=None, enzyme=None):
    """
    Example workflow to perform an open modification search with three
    independent search engines across all mzML files of a given folder and
    to statistically post-process and combine the results of all searches.

    Usage:
        ./open_modification_search_incl_combined_pep.py <mzML_folder> <database> <enzyme>

    Args:
        folder (str): folder containing the mzML files to search.
        database (str): FASTA target-decoy database for the search.
        enzyme (str): enzyme used for the digestion (e.g. 'trypsin').
    """
    # Bug fix: ``defaultdict`` is used below but was never imported at
    # module level, which made the original script fail with a NameError.
    # A function-scope import keeps this block self-contained.
    from collections import defaultdict

    # For this particular dataset, two enzymes were used, namely gluc and trypsin.
    # glob.glob already returns a list, so no manual append loop is needed.
    mzml_files = glob.glob(os.path.join(folder, "*.mzML"))
    mass_spectrometer = "QExactive+"
    validation_engine = "percolator_3_4_0"
    search_engines = ["msfragger_2_3", "pipi_1_4_6", "moda_v1_61"]
    params = {
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ],
        "frag_mass_tolerance_unit": "ppm",
        "frag_mass_tolerance": 20,
        "precursor_mass_tolerance_unit": "ppm",
        "precursor_mass_tolerance_plus": 5,
        "precursor_mass_tolerance_minus": 5,
        "moda_high_res": False,
        # Open search: allow modification masses from -200 Da to +4000 Da.
        "max_mod_size": 4000,
        "min_mod_size": -200,
        "precursor_true_units": "ppm",
        "precursor_true_tolerance": 5,
        "percolator_post_processing": "mix-max",
        # "Mass Difference" is part of the PSM definition so that different
        # open-modification mass shifts are not merged away.
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
            "Mass Difference",
        ],
        "database": database,
        "enzyme": enzyme,
    }
    uc = ursgal.UController(
        profile=mass_spectrometer,
        params=params,
    )
    # This will hold input to combined PEP engine
    combined_pep_input = defaultdict(list)
    # This dictionary will help organize which results to merge
    all_merged_results = defaultdict(list)
    for search_engine in search_engines:
        # The modification size for MSFragger is configured through precursor mass tolerance
        if search_engine == "msfragger_2_3":
            uc.params.update(
                {
                    "precursor_mass_tolerance_unit": "da",
                    "precursor_mass_tolerance_plus": 4000,
                    "precursor_mass_tolerance_minus": 200,
                }
            )
        for n, spec_file in enumerate(mzml_files):
            # 1. convert to MGF
            mgf_file = uc.convert(
                input_file=spec_file,
                engine="mzml2mgf_2_0_0",
            )
            # 2. do the actual search
            raw_search_results = uc.search_mgf(
                input_file=mgf_file,
                engine=search_engine,
            )
            # reset precursor mass tolerance just in case it was previously changed
            uc.params.update(
                {
                    "precursor_mass_tolerance_unit": "ppm",
                    "precursor_mass_tolerance_plus": 5,
                    "precursor_mass_tolerance_minus": 5,
                }
            )
            # 3. convert files to csv
            csv_search_results = uc.convert(
                input_file=raw_search_results,
                engine=None,
                guess_engine=True,
            )
            # 4. protein mapping.
            mapped_csv_search_results = uc.execute_misc_engine(
                input_file=csv_search_results,
                engine="upeptide_mapper_1_0_0",
            )
            # 5. Convert csv to unified ursgal csv format:
            unified_search_results = uc.execute_misc_engine(
                input_file=mapped_csv_search_results,
                engine="unify_csv_1_0_0",
                merge_duplicates=False,
            )
            # 6. Validate the results
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine=validation_engine,
            )
            # save the validated input for combined pep
            # Eventually, each sample will have 3 files corresponding to the 3 search engines
            combined_pep_input["sample_{0}".format(n)].append(validated_csv)
            filtered_validated_results = uc.execute_misc_engine(
                input_file=validated_csv,
                engine="filter_csv_1_0_0",
                merge_duplicates=False,
            )
            all_merged_results["percolator_only"].append(filtered_validated_results)
    # combined pep: filter on "combined PEP" and (temporarily) drop
    # "Mass Difference" from the PSM-defining columns.
    uc.params.update(
        {
            "csv_filter_rules": [
                ["Is decoy", "equals", "false"],
                ["combined PEP", "lte", 0.01],
            ],
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
                "Modifications",
                "Charge",
                "Is decoy",
            ],
        }
    )
    for sample in combined_pep_input.keys():
        combine_results = uc.execute_misc_engine(
            input_file=combined_pep_input[sample],
            engine="combine_pep_1_0_0",
        )
        filtered_validated_results = uc.execute_misc_engine(
            input_file=combine_results,
            engine="filter_csv_1_0_0",
        )
        all_merged_results["combined_pep"].append(filtered_validated_results)
    # separately merge results from the two types of validation techniques
    # We also add back "Mass Difference" to columns defining a PSM to avoid merging mass differences
    uc.params.update(
        {
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
                "Modifications",
                "Charge",
                "Is decoy",
                "Mass Difference",
            ],
        }
    )
    for validation_type in all_merged_results.keys():
        if validation_type == "percolator_only":
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "PEP": "min_value",
            }
        else:
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "combined PEP": "min_value",
                "Bayes PEP": "min_value",
            }
        uc.params["prefix"] = "All_{0}".format(
            validation_type
        )  # helps recognize files easily
        merged_results_one_rep = uc.execute_misc_engine(
            input_file=all_merged_results[validation_type],
            engine="merge_csvs_1_0_0",
            merge_duplicates=True,
        )
        uc.params["prefix"] = ""
if __name__ == "__main__":
    # CLI entry point: expects <mzML_folder> <database> <enzyme>.
    main(
        folder=sys.argv[1],
        database=sys.argv[2],
        enzyme=sys.argv[3],
    )
Complete workflow for analysis with TagGraph¶
-
example_pglyco_workflow.
main
(input_raw_file=None, database=None, enzyme=None) - This script executes three steps:
- convert .raw file using pParse
- search the resulting .mgf file using pGlyco
- validate results using pGlycoFDR
Since pGlyco does not support providing a custom target-decoy database, a database including only target sequences must be provided. In this database, the N-glycosylation motif N-X-S/T needs to be replaced with J-X-S/T, which will be done by the pGlyco wrapper.
- usage:
- ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>
#!/usr/bin/env python
# encoding: utf-8
import ursgal
import os
import sys
import shutil
def main(input_raw_file=None, database=None, enzyme=None):
    """
    This script executes three steps:
        - convert .raw file using pParse
        - search the resulting .mgf file using pGlyco
        - validate results using pGlycoFDR

    Since pGlyco does not support providing a custom target-decoy database,
    a database including only target sequences must be provided.
    In this database, the N-glycosylation motif N-X-S/T needs to be replaced
    with J-X-S/T, which will be done by the pGlyco wrapper.

    usage:
        ./simple_example_pglyco_workflow.py <input_raw_file> <database> <enzyme>

    Args:
        input_raw_file (str): Thermo .raw file to process.
        database (str): target-only FASTA database (see note above).
        enzyme (str): enzyme used for the digestion (e.g. 'trypsin').
    """
    uc = ursgal.UController(
        profile="QExactive+",
        params={
            "database": database,
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-terminal acetylation
            ],
            "peptide_mapper_class_version": "UPeptideMapper_v4",
            "enzyme": enzyme,
            "frag_mass_tolerance": 20,
            "frag_mass_tolerance_unit": "ppm",
            "precursor_mass_tolerance_plus": 5,
            "precursor_mass_tolerance_minus": 5,
            "aa_exception_dict": {},
            "csv_filter_rules": [
                ["Is decoy", "equals", "false"],
                ["q-value", "lte", 0.01],
                ["Conflicting uparam", "contains_not", "enzyme"],
            ],
        },
    )
    # Step 1: extract spectra from the raw file with pParse (pGlyco's input).
    xtracted_file = uc.convert(
        input_file=input_raw_file,
        engine="pparse_2_2_1",
        # force = True,
    )
    # Also produce an mzML (and from it an MGF) of the same raw file so that
    # downstream nodes can map retention times / spectrum metadata.
    mzml_file = uc.convert(
        input_file=input_raw_file,
        engine="thermo_raw_file_parser_1_1_2",
    )
    # os.remove(mzml_file.replace('.mzML', '.mgf'))
    # os.remove(mzml_file.replace('.mzML', '.mgf.u.json'))
    mgf_file = uc.convert(
        input_file=mzml_file,
        engine="mzml2mgf_2_0_0",
        force=True,
    )
    # Step 2: run the pGlyco database search on the pParse output.
    search_result = uc.search_mgf(
        input_file=xtracted_file,
        engine="pglyco_db_2_2_2",
    )
    # Map peptides to proteins and unify the raw search output.
    mapped_results = uc.execute_misc_engine(
        input_file=search_result,
        engine="upeptide_mapper",
    )
    unified_search_results = uc.execute_misc_engine(
        input_file=mapped_results, engine="unify_csv"
    )
    # Step 3: validate the (raw) pGlyco result with pGlycoFDR, then map,
    # unify and filter the validated output as well.
    validated_file = uc.validate(
        input_file=search_result,
        engine="pglyco_fdr_2_2_0",
    )
    mapped_validated_results = uc.execute_misc_engine(
        input_file=validated_file,
        engine="upeptide_mapper",
    )
    unified_validated_results = uc.execute_misc_engine(
        input_file=mapped_validated_results, engine="unify_csv"
    )
    filtered_validated_results = uc.execute_misc_engine(
        input_file=unified_validated_results,
        engine="filter_csv",
    )
    return
if __name__ == "__main__":
    # CLI entry point: expects <input_raw_file> <database> <enzyme>.
    main(input_raw_file=sys.argv[1], database=sys.argv[2], enzyme=sys.argv[3])
Processing of mass differences using PTM-Shepherd¶
-
ptmshepherd_mass_difference_processing.
main
(folder=None, sanitized_search_results=None)¶ Downstream processing of mass differences from open modification search results using PTM-Shepherd. A folder containing all relevant mzML files and a sanitized (one PSM per spectrum) Ursgal result file (containing open modification search results) are required.
- usage:
- ./ptmshepherd_mass_difference_processing.py <folder_with_mzML> <sanitized_result_file>
#!/usr/bin/env python3
import os
import ursgal
import sys
import glob
from collections import defaultdict as ddict
import csv
def main(folder=None, sanitized_search_results=None):
    """
    Post-process mass differences from open modification search results
    with PTM-Shepherd.

    A folder containing all relevant mzML files and a sanitized (one PSM
    per spectrum) Ursgal result file (containing open modification search
    results) are required.

    usage:
        ./ptmshepherd_mass_difference_processing.py <folder_with_mzML> <sanitized_result_file>
    """
    # Every mzML file in the folder is handed to PTM-Shepherd via params.
    spectrum_files = list(glob.glob(os.path.join(folder, "*.mzML")))
    controller_params = {
        "use_pyqms_for_mz_calculation": True,
        "cpus": 8,
        "precursor_mass_tolerance_unit": "ppm",
        "precursor_mass_tolerance_plus": 5,
        "precursor_mass_tolerance_minus": 5,
        "frag_mass_tolerance_unit": "ppm",
        "frag_mass_tolerance": 20,
        "modifications": [
            "C,opt,any,Carbamidomethyl",
        ],
        "-xmx": "12G",
        # A PSM is identified by spectrum title and sequence only here.
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
        ],
        "mzml_input_files": spectrum_files,
        "validation_score_field": "combined PEP",
        "bigger_scores_better": False,
    }
    uc = ursgal.UController(
        params=controller_params, profile="QExactive+", verbose=False
    )
    # Run PTM-Shepherd on the sanitized result file; output files are
    # written next to the input by the engine itself.
    uc.validate(
        input_file=sanitized_search_results,
        engine="ptmshepherd_0_3_5",
    )
if __name__ == "__main__":
    # CLI entry point: expects <folder_with_mzML> <sanitized_result_file>.
    main(
        folder=sys.argv[1],
        sanitized_search_results=sys.argv[2],
    )
Version Comparison Scripts¶
X!Tandem version comparison¶
-
xtandem_version_comparison.
main
()¶ Executes a search with 5 versions of X!Tandem on an example file from the data from Barth et al. 2014.
- usage:
- ./xtandem_version_comparison.py
This is a simple example file to show the straightforward comparison of different program versions of X!Tandem
Creates a Venn diagram with the peptides obtained by the different versions.
Note
At the moment in total 6 XTandem versions are incorporated in Ursgal.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Compare five program versions of X!Tandem on one example file from the
    data from Barth et al. 2014.

    usage:
        ./xtandem_version_comparison.py

    Each version searches the same mzML file; the Percolator-validated and
    FDR-filtered peptide sets are then compared in a Venn diagram.

    Note:
        At the moment in total 6 XTandem versions are incorporated in Ursgal.
    """
    xtandem_versions = [
        # 'xtandem_cyclone',
        "xtandem_jackhammer",
        "xtandem_sledgehammer",
        "xtandem_piledriver",
        "xtandem_vengeance",
        "xtandem_alanine",
    ]
    base_params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "modifications": [],
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]],
        # Credentials for the public PeptideAtlas repository entry PASS00269.
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_include_ext": [
            "JB_FASP_pH8_2-3_28122012.mzML",
        ],
        "ftp_output_folder": os.path.join(
            os.pardir, "example_data", "xtandem_version_comparison"
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
    }
    if not os.path.exists(base_params["ftp_output_folder"]):
        os.mkdir(base_params["ftp_output_folder"])
    uc = ursgal.UController(profile="LTQ XL low res", params=base_params)
    mzML_file = os.path.join(
        base_params["ftp_output_folder"], base_params["ftp_include_ext"][0]
    )
    # Download the example spectrum file and the database on the first run.
    if not os.path.exists(mzML_file):
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    if not os.path.exists(base_params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    venn_input_files = []
    for xtandem_version in xtandem_versions:
        # Search, validate with Percolator and filter for each version.
        unified_csv = uc.search(
            input_file=mzML_file,
            engine=xtandem_version,
        )
        percolator_csv = uc.validate(
            input_file=unified_csv,
            engine="percolator_2_08",
        )
        venn_input_files.append(
            uc.execute_misc_engine(
                input_file=percolator_csv,
                engine="filter_csv_1_0_0",
            )
        )
    uc.visualize(
        input_files=venn_input_files,
        engine="venndiagram_1_1_0",
    )
    return
if __name__ == "__main__":
    # No CLI arguments needed; all inputs are fetched/derived inside main().
    main()
MSGF+ version comparison¶
-
msgf_version_comparison.
main
()¶ Executes a search with a current maximum of 3 versions of MSGF+ on an example file from the data from Barth et al. (2014)
- usage:
- ./msgf_version_comparison.py
This is a simple example file to show the straightforward comparison of different program versions of MSGF+
Creates a Venn diagram with the peptides obtained by the different versions.
An example plot can be found in the online documentation.
Note
Uses the new MS-GF+ C# mzid converter if available
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Executes a search with a current maximum of 3 versions of MSGF+ on an
    example file from the data from Barth et al. (2014)

    usage:
        ./msgf_version_comparison.py

    This is a simple example file to show the straightforward comparison of
    different program versions of MSGF+

    Creates a Venn diagram with the peptides obtained by the different
    versions.
    An example plot can be found in the online documentation.

    Note:
        Uses the new MS-GF+ C# mzid converter if available
    """
    params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]],
        # Credentials for the public PeptideAtlas repository entry PASS00269.
        "ftp_url": "ftp.peptideatlas.org",
        "ftp_login": "PASS00269",
        "ftp_password": "FI4645a",
        "ftp_include_ext": [
            "JB_FASP_pH8_2-3_28122012.mzML",
        ],
        "ftp_output_folder": os.path.join(
            os.pardir, "example_data", "msgf_version_comparison"
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
        "remove_temporary_files": False,
    }
    if os.path.exists(params["ftp_output_folder"]) is False:
        os.mkdir(params["ftp_output_folder"])
    uc = ursgal.UController(profile="LTQ XL low res", params=params)
    engine_list = [
        "msgfplus_v9979",
        "msgfplus_v2016_09_16",
    ]
    # Newer MS-GF+ versions are only compared when they are installed and
    # flagged as available in the UController's engine registry.
    if "msgfplus_v2017_01_27" in uc.unodes.keys():
        if uc.unodes["msgfplus_v2017_01_27"]["available"]:
            engine_list.append("msgfplus_v2017_01_27")
    if "msgfplus_v2018_01_30" in uc.unodes.keys():
        if uc.unodes["msgfplus_v2018_01_30"]["available"]:
            engine_list.append("msgfplus_v2018_01_30")
    # Prefer the C# mzid-to-csv converter when it is available.
    if "msgfplus_C_mzid2csv_v2017_07_04" in uc.unodes.keys():
        if uc.unodes["msgfplus_C_mzid2csv_v2017_07_04"]["available"]:
            uc.params[
                "msgfplus_mzid_converter_version"
            ] = "msgfplus_C_mzid2csv_v2017_07_04"
    mzML_file = os.path.join(params["ftp_output_folder"], params["ftp_include_ext"][0])
    # Download the example mzML file and the database on the first run.
    if os.path.exists(mzML_file) is False:
        uc.fetch_file(engine="get_ftp_files_1_0_0")
    if os.path.exists(params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")
    filtered_files_list = []
    for engine in engine_list:
        # Search, validate with Percolator and filter for each version.
        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
        )
        validated_file = uc.validate(
            input_file=unified_result_file,
            engine="percolator_2_08",
        )
        filtered_file = uc.execute_misc_engine(
            input_file=validated_file,
            engine="filter_csv_1_0_0",
        )
        filtered_files_list.append(filtered_file)
    uc.visualize(
        input_files=filtered_files_list,
        engine="venndiagram_1_1_0",
    )
    return
if __name__ == "__main__":
    # No CLI arguments needed; all inputs are fetched/derived inside main().
    main()
Bruderer et al. (2015) X!Tandem comparison incl. machine ppm offset¶
-
xtandem_version_comparison_qexactive.
main
(folder)¶ Executes a search with 5 versions of X!Tandem on an example file from the data from Bruderer et al. 2015.
- usage:
- ./xtandem_version_comparison.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML>
This is a simple example file to show the straightforward comparison of different program versions of X!Tandem, similar to the example script ‘xtandem_version_comparison’, but analyzing high resolution data which can be better handled by versions newer than Jackhammer. One gains approximately 10 percent more peptides with newer versions of X!Tandem.
Creates a Venn diagram with the peptides obtained by the different versions.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
import sys
def main(folder):
    """
    Executes a search with 5 versions of X!Tandem on an example file from the
    data from Bruderer et al. 2015.

    usage:
        ./xtandem_version_comparison.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML>

    This is a simple example file to show the straightforward comparison of
    different program versions of X!Tandem, similar to the example script
    'xtandem_version_comparison', but analyzing high resolution data
    which can be better handled by versions newer than Jackhammer. One gains
    approximately 10 percent more peptides with newer versions of X!Tandem.

    Creates a Venn diagram with the peptides obtained by the different versions.
    """
    required_example_file = "B_D140314_SGSDSsample1_R01_MSG_T0.mzML"
    mzML_file = os.path.join(folder, required_example_file)
    # If the example mzML is missing, only trigger the FTP download of the
    # RAW data and ask the user to convert it to mzML before re-running.
    if not os.path.exists(mzML_file):
        print(
            """
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn)
            will be downloaded.
            Please convert to mzML after the download has finished and run this
            script again.
            """.format(
                required_example_file
            )
        )
        downloader = ursgal.UController(
            params={
                "ftp_url": "ftp.peptideatlas.org",
                "ftp_login": "PASS00589",
                "ftp_password": "WF6554orn",
                "ftp_include_ext": [required_example_file.replace(".mzML", ".raw")],
                "ftp_output_folder": folder,
            }
        )
        downloader.fetch_file(engine="get_ftp_files_1_0_0")
        sys.exit(1)
    search_params = {
        "database": os.path.join(
            os.pardir, "example_data", "hs_201303_qs_sip_target_decoy.fasta"
        ),
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]],
        "http_url": "http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
        "machine_offset_in_ppm": -5e-6,
    }
    uc = ursgal.UController(profile="QExactive+", params=search_params)
    # Fetch the target/decoy database over HTTP on first run.
    if not os.path.exists(search_params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    xtandem_versions = [
        "xtandem_cyclone",
        "xtandem_jackhammer",
        "xtandem_sledgehammer",
        "xtandem_piledriver",
        "xtandem_vengeance",
    ]
    # Search, validate and filter with every X!Tandem version, then compare
    # the filtered result files in one Venn diagram.
    filtered_files = []
    for xtandem_version in xtandem_versions:
        unified_csv = uc.search(
            input_file=mzML_file,
            engine=xtandem_version,
            force=False,
        )
        validated_csv = uc.validate(
            input_file=unified_csv,
            engine="percolator_2_08",
        )
        filtered_files.append(
            uc.execute_misc_engine(
                input_file=validated_csv,
                engine="filter_csv_1_0_0",
            )
        )
    uc.visualize(
        input_files=filtered_files,
        engine="venndiagram_1_1_0",
    )
    return
# Command-line entry point: the folder containing the Bruderer mzML data is
# the single required argument; without it the docstring is printed.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(main.__doc__)
        sys.exit(1)
    main(sys.argv[1])
Parameter Optimization Scripts¶
BSA machine ppm offset example¶
-
bsa_ppm_offset_test.
main
()¶ Example script to do a simple machine ppm offset parameter sweep. The m/z values in the example mgf file are changed stepwise and in the final output the total peptides are counted.
- usage:
- ./bsa_ppm_offset_test.py
Note
As expected, if the offset becomes too big, no peptides can be found anymore.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil
def main():
    """
    Example script to do a simple machine ppm offset parameter sweep.

    The m/z values in the example mgf file are changed stepwise and in the
    final output the total peptides are counted.

    usage:
        ./bsa_ppm_offset_test.py

    Note:
        As expected, if the offset becomes too big no peptides can be found
        anymore.
    """
    # Offsets from -10 to +10 ppm; None stands in for 0 so that ursgal
    # skips the m/z adjustment entirely. The prefix encodes the offset in
    # the result file names so the collector below can report per offset.
    ppm_offsets = [
        (offset if offset != 0 else None, "{0}_ppm_offset".format(offset))
        for offset in range(-10, 11)
    ]
    engine_list = ["xtandem_vengeance"]
    R = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-Acteylation
            ],
        },
    )
    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_machine_ppm_offset_example", "BSA1.mzML"
    )
    if os.path.exists(mzML_file) is False:
        R.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        R.params["http_output_folder"] = os.path.dirname(mzML_file)
        R.fetch_file(engine="get_http_files_1_0_0")
        # The downloaded file may keep the "?format=raw" suffix (or a
        # mangled variant of it); rename it to the expected mzML name.
        # Only file-system errors are caught (was a bug-hiding bare except).
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    for engine in engine_list:
        for ppm_offset, prefix in ppm_offsets:
            R.params["machine_offset_in_ppm"] = ppm_offset
            R.params["prefix"] = prefix
            R.search(
                input_file=mzML_file,
                engine=engine,
                force=False,
            )
    # Collect the set of identified peptide sequences per result file and
    # print one count per tested offset.
    collector = ddict(set)
    for csv_path in glob.glob("{0}/*/*unified.csv".format(os.path.dirname(mzML_file))):
        with open(csv_path, "r") as csv_io:
            for line_dict in csv.DictReader(csv_io):
                collector[csv_path].add(line_dict["Sequence"])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        offset = file_name.split("_")[0]
        print(
            "Search with {0: >3} ppm offset found {1: >2} peptides".format(
                offset, len(peptide_set)
            )
        )
    return
# Run the offset sweep when executed as a standalone script.
if __name__ == "__main__":
    main()
Precursor mass tolerance example¶
-
bsa_precursor_mass_tolerance_example.
main
()¶ Example script to do a precursor mass tolerance parameter sweep.
- usage:
- ./bsa_precursor_mass_tolerance_example.py
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil
def main():
    """
    Example script to do a precursor mass tolerance parameter sweep.

    usage:
        ./bsa_precursor_mass_tolerance_example.py
    """
    # Sweep the symmetric precursor tolerance from 1 to 10 ppm.
    precursor_mass_tolerance_list = list(range(1, 11))
    engine_list = ["xtandem_vengeance"]
    R = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-Acteylation
            ],
        },
    )
    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_precursor_mass_tolerance_example", "BSA1.mzML"
    )
    if os.path.exists(mzML_file) is False:
        R.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        R.params["http_output_folder"] = os.path.dirname(mzML_file)
        R.fetch_file(engine="get_http_files_1_0_0")
        # The downloaded file may keep the "?format=raw" suffix; rename it.
        # Only file-system errors are caught (was a bug-hiding bare except).
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    # Convert mzML to MGF outside the loop, so this step is not repeated in
    # the loop
    mgf_file = R.convert(input_file=mzML_file, engine="mzml2mgf_1_0_0")
    for engine in engine_list:
        for precursor_mass_tolerance in precursor_mass_tolerance_list:
            # The prefix encodes the tolerance in the result file names so
            # the collector below can report per-tolerance peptide counts.
            R.params["precursor_mass_tolerance_minus"] = precursor_mass_tolerance
            R.params["precursor_mass_tolerance_plus"] = precursor_mass_tolerance
            R.params["prefix"] = "{0}_precursor_mass_tolerance_".format(
                precursor_mass_tolerance
            )
            R.search(
                input_file=mgf_file,
                engine=engine,
                force=False,
            )
    # Collect the set of identified peptide sequences per result file and
    # print one count per tested tolerance.
    collector = ddict(set)
    for csv_path in glob.glob("{0}/*/*unified.csv".format(os.path.dirname(mzML_file))):
        with open(csv_path, "r") as csv_io:
            for line_dict in csv.DictReader(csv_io):
                collector[csv_path].add(line_dict["Sequence"])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        tolerance = file_name.split("_")[0]
        print(
            "Search with {0: >2} ppm precursor mass tolerance found {1: >2} peptides".format(
                tolerance, len(peptide_set)
            )
        )
    return
# Run the precursor tolerance sweep when executed as a standalone script.
if __name__ == "__main__":
    main()
Fragment mass tolerance example¶
-
bsa_fragment_mass_tolerance_example.
main
()¶ Example script to do a fragment mass tolerance parameter sweep.
- usage:
- ./bsa_fragment_mass_tolerance_example.py
If the fragment mass tolerance becomes too small, very few peptides are found. With this small sweep the actual min accuracy of a mass spectrometer can be estimated.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil
def main():
    """
    Example script to do a fragment mass tolerance parameter sweep.

    usage:
        ./bsa_fragment_mass_tolerance_example.py

    If the fragment mass tolerance becomes too small, very few peptides are
    found. With this small sweep the actual min accuracy of a mass
    spectrometer can be estimated.
    """
    # Tested fragment tolerances in Da; kept explicit because the steps
    # are not uniform (0.02 steps below 0.1, then 0.1 steps).
    fragment_mass_tolerance_list = [
        0.02,
        0.04,
        0.06,
        0.08,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
    ]
    engine_list = ["xtandem_vengeance"]
    R = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-Acteylation
            ],
        },
    )
    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_fragment_mass_tolerance_example", "BSA1.mzML"
    )
    if os.path.exists(mzML_file) is False:
        R.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        R.params["http_output_folder"] = os.path.dirname(mzML_file)
        R.fetch_file(engine="get_http_files_1_0_0")
        # The downloaded file may keep the "?format=raw" suffix; rename it.
        # Only file-system errors are caught (was a bug-hiding bare except).
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except OSError:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)
    # Convert mzML to MGF outside the loop, so this step is not repeated in
    # the loop
    mgf_file = R.convert(
        input_file=mzML_file,
        engine="mzml2mgf_1_0_0",
    )
    for engine in engine_list:
        for fragment_mass_tolerance in fragment_mass_tolerance_list:
            # The prefix encodes the tolerance in the result file names so
            # the collector below can report per-tolerance peptide counts.
            R.params["frag_mass_tolerance"] = fragment_mass_tolerance
            R.params["prefix"] = "{0}_fragment_mass_tolerance_".format(
                fragment_mass_tolerance
            )
            R.search(
                input_file=mgf_file,
                engine=engine,
                force=False,
            )
    # Collect the set of identified peptide sequences per result file and
    # print one count per tested tolerance.
    collector = ddict(set)
    for csv_path in glob.glob("{0}/*/*unified.csv".format(os.path.dirname(mzML_file))):
        with open(csv_path, "r") as csv_io:
            for line_dict in csv.DictReader(csv_io):
                collector[csv_path].add(line_dict["Sequence"])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        tolerance = file_name.split("_")[0]
        print(
            "Search with {0: <4} Da fragment mass tolerance found {1: >2} peptides".format(
                tolerance, len(peptide_set)
            )
        )
    return
# Run the fragment tolerance sweep when executed as a standalone script.
if __name__ == "__main__":
    main()
Bruderer et al. (2015) machine offset sweep (data for figure 2)¶
-
machine_offset_bruderer_sweep.
search
(input_folder=None)¶ Does the parameter sweep on every tenth MS2 spectrum of the data from Bruderer et al. (2015) using X!Tandem Sledgehammer.
Note
Please download the .RAW data for the DDA dataset from peptideatlas.org (PASS00589, password: WF6554orn) and convert to mzML. Then the script can be executed with the folder with the mzML files as the first argument.
Warning
This script (if the sweep ranges are not changed) will perform 10080 searches which will produce approximately 100 GB output (inclusive mzML files)
usage:
./machine_offset_bruderer_sweep.py <folder_with_bruderer_data>
Sweeps over:
- machine_offset_in_ppm from -20 to +20 ppm offset
- precursor mass tolerance from 1 to 5 ppm
- fragment mass tolerance from -2.5 to 20 ppm
The search can be very time consuming (depending on your machine/cluster), therefore the analyze step can be performed separately by calling analyze() instead of search() when one has already performed the searches and wants to analyze the results.
-
machine_offset_bruderer_sweep.
analyze
(folder)¶ Parses the result files from search and writes a result .csv file which contains the data to plot figure 2.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import csv
import os
from collections import defaultdict as ddict
import sys
import re
# MaxQuant-derived machine ppm offsets paired with the mzML file each one
# was determined for (Bruderer et al. 2015 DDA dataset).  Used by search()
# to pick the files to sweep and by analyze() as the reference "MQ_offset".
MQ_OFFSET_TO_FILENAME = [
    (4.71, "B_D140314_SGSDSsample2_R01_MSG_T0.mzML"),
    (4.98, "B_D140314_SGSDSsample6_R01_MSG_T0.mzML"),
    (4.9, "B_D140314_SGSDSsample1_R01_MSG_T0.mzML"),
    (5.41, "B_D140314_SGSDSsample4_R01_MSG_T0.mzML"),
    (5.78, "B_D140314_SGSDSsample5_R01_MSG_T0.mzML"),
    (6.01, "B_D140314_SGSDSsample8_R01_MSG_T0.mzML"),
    (6.22, "B_D140314_SGSDSsample7_R01_MSG_T0.mzML"),
    (6.83, "B_D140314_SGSDSsample3_R01_MSG_T0.mzML"),
    (7.61, "B_D140314_SGSDSsample4_R02_MSG_T0.mzML"),
    (7.59, "B_D140314_SGSDSsample8_R02_MSG_T0.mzML"),
    (7.93, "B_D140314_SGSDSsample6_R02_MSG_T0.mzML"),
    (7.91, "B_D140314_SGSDSsample1_R02_MSG_T0.mzML"),
    (8.23, "B_D140314_SGSDSsample3_R02_MSG_T0.mzML"),
    (8.33, "B_D140314_SGSDSsample7_R02_MSG_T0.mzML"),
    (9.2, "B_D140314_SGSDSsample5_R02_MSG_T0.mzML"),
    (9.4, "B_D140314_SGSDSsample2_R02_MSG_T0.mzML"),
    (9.79, "B_D140314_SGSDSsample1_R03_MSG_T0.mzML"),
    (10.01, "B_D140314_SGSDSsample3_R03_MSG_T0.mzML"),
    (10.03, "B_D140314_SGSDSsample7_R03_MSG_T0.mzML"),
    (10.58, "B_D140314_SGSDSsample2_R03_MSG_T0.mzML"),
    (11.1, "B_D140314_SGSDSsample4_R03_MSG_T0.mzML"),
    (11.21, "B_D140314_SGSDSsample5_R03_MSG_T0.mzML"),
    (11.45, "B_D140314_SGSDSsample6_R03_MSG_T0.mzML"),
    (12.19, "B_D140314_SGSDSsample8_R03_MSG_T0.mzML"),
]
# Controller parameters shared by search() and analyze(): every tenth MS2
# spectrum is searched (scan_skip_modulo_step) against the human
# target/decoy database, which is fetched from http_url when missing.
GENERAL_PARAMS = {
    "database": os.path.join(
        os.pardir, "example_data", "hs_201303_qs_sip_target_decoy.fasta"
    ),
    "modifications": [
        "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
    ],
    "scan_skip_modulo_step": 10,
    "http_url": "http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta",
    "http_output_folder": os.path.join(os.pardir, "example_data"),
}
def search(input_folder=None):
    """
    Does the parameter sweep on every tenth MS2 spectrum of the data from
    Bruderer et al. (2015) using X!Tandem Sledgehammer.

    Note:
        Please download the .RAW data for the DDA dataset from peptideatlas.org
        (PASS00589, password: WF6554orn) and convert to mzML.
        Then the script can be executed with the folder with the mzML files as
        the first argument.

    Warning:
        This script (if the sweep ranges are not changed) will perform 10080
        searches which will produce approximately 100 GB output (inclusive mzML
        files)

    usage:
        ./machine_offset_bruderer_sweep.py <folder_with_bruderer_data>

    Sweeps over:

        * machine_offset_in_ppm from -20 to +20 ppm offset
        * precursor mass tolerance from 1 to 5 ppm
        * fragment mass tolerance from 2.5 to 20 ppm

    The search can be very time consuming (depending on your machine/cluster),
    therefore the analyze step can be performed separately by calling analyze()
    instead of search() when one has already performed the searches and wants
    to analyze the results.
    """
    # Every file gets the same sweep range; the stop value is 21 so that
    # +20 ppm is included, matching the offsets analyze() expects
    # (range(-20, 21, 2)) and the docstring ("from -20 to +20").
    file_2_target_offset = {}
    for _, file_name in MQ_OFFSET_TO_FILENAME:
        file_2_target_offset[file_name] = {"offsets": list(range(-20, 21, 2))}
    engine_list = ["xtandem_sledgehammer"]
    frag_ion_tolerance_list = [2.5, 5, 10, 20]
    precursor_ion_tolerance_list = [1, 2, 3, 4, 5]
    R = ursgal.UController(profile="QExactive+", params=GENERAL_PARAMS)
    if os.path.exists(R.params["database"]) is False:
        R.fetch_file(engine="get_http_files_1_0_0")
    # analyze() parses offset and tolerances back out of the result file
    # names (indices 2, 4 and 6 after splitting on "_"), so the prefix must
    # contain all three values:  ppm_offset_<o>_pit_<p>_fit_<f>.
    # The previous "_pit_{1}_fit_{2}" raised an IndexError when formatted
    # with two arguments and dropped the offset from the prefix.
    prefix_format_string = "ppm_offset_{0}_pit_{1}_fit_{2}"
    for mzML_path in glob.glob(os.path.join(input_folder, "*.mzML")):
        mzML_basename = os.path.basename(mzML_path)
        if mzML_basename not in file_2_target_offset.keys():
            continue
        for ppm_offset in file_2_target_offset[mzML_basename]["offsets"]:
            R.params["machine_offset_in_ppm"] = ppm_offset
            R.params["prefix"] = "ppm_offset_{0}".format(int(ppm_offset))
            # Convert once per offset so the search loop below reuses the
            # cached conversion.
            R.convert(input_file=mzML_path, engine="mzml2mgf_1_0_0")
            for engine in engine_list:
                for precursor_ion_tolerance in precursor_ion_tolerance_list:
                    for frag_ion_tolerance in frag_ion_tolerance_list:
                        R.params[
                            "precursor_mass_tolerance_minus"
                        ] = precursor_ion_tolerance
                        R.params[
                            "precursor_mass_tolerance_plus"
                        ] = precursor_ion_tolerance
                        R.params["frag_mass_tolerance"] = frag_ion_tolerance
                        R.params["prefix"] = prefix_format_string.format(
                            int(ppm_offset),
                            precursor_ion_tolerance,
                            frag_ion_tolerance,
                        )
                        R.search(
                            input_file=mzML_path,
                            engine=engine,
                            force=False,
                        )
    return
def analyze(folder):
    """
    Parses the result files from search() and writes one result .csv file per
    (precursor tolerance, fragment tolerance) combination, containing the
    peptide counts per sample and tested offset (data to plot figure 2).
    """
    R = ursgal.UController(profile="QExactive+", params=GENERAL_PARAMS)
    csv_collector = {}
    ve = "qvality_2_02"
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence (DeprecationWarning today, a SyntaxError in future Pythons).
    sample_regex_pattern = r"sample\d_R0\d"
    sample_2_x_pos_and_mq_offset = {}
    sample_offset_combos = []
    all_tested_offsets = [str(n) for n in range(-20, 21, 2)]
    # Map every sample to its plot position and MaxQuant reference offset,
    # and enumerate all (sample, offset) pairs so missing combinations can
    # be written out with a zero count later.
    for pos, (mq_ppm_off, mzML_file) in enumerate(MQ_OFFSET_TO_FILENAME):
        _sample = re.search(sample_regex_pattern, mzML_file).group()
        sample_2_x_pos_and_mq_offset[_sample] = (pos, mq_ppm_off)
        for theo_offset in all_tested_offsets:
            sample_offset_combos.append((_sample, theo_offset))
    for csv_path in glob.glob(os.path.join("{0}".format(folder), "*", "*_unified.csv")):
        sample = re.search(sample_regex_pattern, csv_path).group()
        splitted_basename = os.path.basename(csv_path).split("_")
        # File names start with ppm_offset_<o>_pit_<p>_fit_<f>_..., so the
        # values sit at the odd indices after each tag.
        offset = splitted_basename[2]
        precursor_ion_tolerance = splitted_basename[4]
        frag_ion_tolerance = splitted_basename[6]
        prefix = "_".join(splitted_basename[:7])
        R.params["machine_offset_in_ppm"] = offset
        R.params["precursor_mass_tolerance_minus"] = precursor_ion_tolerance
        R.params["precursor_mass_tolerance_plus"] = precursor_ion_tolerance
        R.params["frag_mass_tolerance"] = frag_ion_tolerance
        R.params["prefix"] = prefix
        validated_path = csv_path.replace(
            "_unified.csv", "_{0}_validated.csv".format(ve)
        )
        if os.path.exists(validated_path):
            csv_path = validated_path
        else:
            # Best effort: skip result files that cannot be validated
            # instead of aborting the whole analysis.
            try:
                csv_path = R.validate(input_file=csv_path, engine=ve)
            except Exception:
                continue
        pit_fit = (precursor_ion_tolerance, frag_ion_tolerance)
        if pit_fit not in csv_collector.keys():
            csv_collector[pit_fit] = ddict(set)
        csv_key = (sample, offset)
        print("Reading file: {0}".format(csv_path))
        # Count unique target peptidoforms (sequence + modifications) that
        # pass the 1% PEP threshold.
        with open(csv_path, "r") as csv_io:
            for line_dict in csv.DictReader(csv_io):
                if line_dict["Is decoy"] == "true":
                    continue
                if float(line_dict["PEP"]) <= 0.01:
                    csv_collector[pit_fit][csv_key].add(
                        "{0}{1}".format(
                            line_dict["Sequence"], line_dict["Modifications"]
                        )
                    )
    fieldnames = ["Sample", "pos", "MQ_offset", "tested_ppm_offset", "peptide_count"]
    outfile_name_format_string = "bruderer_data_ppm_sweep_precursor_mass_tolerance_{0}_fragment_mass_tolerance_{1}.csv"
    for pit_fit in csv_collector.keys():
        with open(outfile_name_format_string.format(*pit_fit), "w") as io:
            csv_writer = csv.DictWriter(io, fieldnames)
            csv_writer.writeheader()
            # write missing values
            for sample_offset in sample_offset_combos:
                sample, ppm_offset = sample_offset
                if sample_offset not in csv_collector[pit_fit].keys():
                    csv_writer.writerow(
                        {
                            "Sample": sample,
                            "pos": sample_2_x_pos_and_mq_offset[sample][0],
                            "MQ_offset": "",
                            "tested_ppm_offset": ppm_offset,
                            "peptide_count": 0,
                        }
                    )
            # MQ offsets are negated so they point in the same direction as
            # the tested correction offsets.
            for (sample, ppm_offset), peptide_set in csv_collector[pit_fit].items():
                csv_writer.writerow(
                    {
                        "Sample": sample,
                        "pos": sample_2_x_pos_and_mq_offset[sample][0],
                        "MQ_offset": sample_2_x_pos_and_mq_offset[sample][1] * -1,
                        "tested_ppm_offset": ppm_offset,
                        "peptide_count": len(peptide_set),
                    }
                )
    return
# First run the parameter sweep, then summarize the results; both steps use
# the folder given as the first command-line argument.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(search.__doc__)
        sys.exit(1)
    search(sys.argv[1])
    analyze(sys.argv[1])
Filter CSV Examples¶
Filter for modifications¶
-
filter_csv_for_mods_example.
main
()¶ Example script for filtering unified results for modification containing entries
- usage:
- ./filter_csv_for_mods_example.py
Will produce a file with only entries which contain Carbamidomethyl as a modification.
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Example script for filtering unified results for modification containing
    entries.

    usage:
        ./filter_csv_for_mods_example.py

    Will produce a file with only entries which contain Carbamidomethyl as a
    modification.
    """
    # Unified OMSSA example results shipped with ursgal.
    input_csv = os.path.join(
        os.pardir,
        "example_data",
        "misc",
        "filter_csv_for_mods_example_omssa_2_1_9_pmap_unified.csv",
    )
    # Keep only rows whose Modifications column mentions Carbamidomethyl;
    # skip writing the complementary "unfiltered" file.
    filter_params = {
        "csv_filter_rules": [
            ["Modifications", "contains", "Carbamidomethyl"],
        ],
        "write_unfiltered_results": False,
    }
    controller = ursgal.UController(params=filter_params)
    controller.execute_misc_engine(
        input_file=input_csv,
        engine="filter_csv_1_0_0",
    )
# Run the modification filter example when executed as a standalone script.
if __name__ == "__main__":
    main()
Filter validated results¶
-
filter_csv_validation_example.
main
()¶ Example script for filtering validated results for a PEP <= 0.01 and remove all decoys.
- usage:
- ./filter_csv_validation_example.py
Will produce a file with only target sequences with a posterior error probability of lower or equal to 1 percent
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import os
def main():
    """
    Example script for filtering validated results for a PEP <= 0.01 and
    remove all decoys.

    usage:
        ./filter_csv_validation_example.py

    Will produce a file with only target sequences with a posterior error
    probability of lower or equal to 1 percent
    """
    # Percolator-validated example results shipped with ursgal.
    input_csv = os.path.join(
        os.pardir,
        "example_data",
        "misc",
        "filter_csv_for_mods_example_omssa_2_1_9_pmap_unified_percolator_2_08_validated.csv",
    )
    # Keep only target hits at <= 1% posterior error probability.
    filter_params = {
        "csv_filter_rules": [["PEP", "lte", 0.01], ["Is decoy", "equals", "false"]]
    }
    controller = ursgal.UController(params=filter_params)
    controller.execute_misc_engine(
        input_file=input_csv,
        engine="filter_csv_1_0_0",
    )
# Run the validation filter example when executed as a standalone script.
if __name__ == "__main__":
    main()
SugarPy Example Scripts¶
SugarPy complete workflow¶
-
sugarpy_example_workflow.
main
(folder=None, target_decoy_database=None)¶ Complete example workflow for the identification of intact glycopeptides from MS runs employing IS-CID.
- usage:
- ./do_it_all_folder_wide.py <mzML_folder> <target_decoy_database>
#!/usr/bin/env python3.4
# encoding: utf-8
import ursgal
import sys
import glob
import os
import shutil
def main(folder=None, target_decoy_database=None):
    '''
    Complete example workflow for the identification of intact glycopeptides
    from MS runs employing IS-CID.

    usage:
        ./do_it_all_folder_wide.py <mzML_folder> <target_decoy_database>
    '''
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    for mzml in glob.glob(os.path.join('{0}'.format(folder), '*.mzML')):
        mzML_files.append(mzml)
    mass_spectrometer = 'QExactive+'
    # Define search engines for protein database search
    search_engines = [
        'xtandem_vengeance',
        'msgfplus_v2019_07_03',
        'msfragger_2_3',
    ]
    # Define validation engine for protein database search
    validation_engine = 'percolator_3_4_0'
    # Modifications that should be included in the search.
    # Glycan modifications will be added later.
    all_mods = [
        'C,fix,any,Carbamidomethyl',
        'M,opt,any,Oxidation',
        '*,opt,Prot-N-term,Acetyl',
    ]
    # Initializing the Ursgal UController class with
    # our specified modifications and mass spectrometer
    params = {
        'cpus':8,
        'database' : target_decoy_database,
        'modifications' : all_mods,
        'csv_filter_rules' : [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ],
        'peptide_mapper_class_version': 'UPeptideMapper_v4',
        'use_pyqms_for_mz_calculation': True,
        '-xmx' : '16g',
        'upper_mz_limit': 3000,
        'lower_mz_limit': 500,
        'precursor_mass_tolerance_plus': 5,
        'precursor_mass_tolerance_minus': 5,
        'frag_mass_tolerance': 20,
    }
    uc = ursgal.UController(
        profile=mass_spectrometer,
        params=params
    )
    # complete workflow
    # NOTE(review): pot_glyco_result_files and sugarpy_curated_files are
    # initialized here but never filled or read below.
    result_files_unfilered = []
    glyco_result_files = []
    pot_glyco_result_files = []
    sugarpy_result_files = []
    sugarpy_curated_files = []
    for spec_file in mzML_files:
        mgf_file = uc.convert(
            input_file=spec_file,
            engine='mzml2mgf_2_0_0'
        )
        results_one_file = []
        for search_engine in search_engines:
            # search MS2 specs for peptides with HexNAc and HexNAc(2) modification
            for mod in [
                'N,opt,any,HexNAc',
                'N,opt,any,HexNAc(2)'
            ]:
                # Temporarily extend the shared modification list with the
                # glycan stub; it is removed again at the end of this
                # iteration so the next search starts from all_mods.
                uc.params['modifications'].append(mod)
                uc.params['prefix'] = mod.split(',')[3]
                search_result = uc.search_mgf(
                    input_file=mgf_file,
                    engine=search_engine
                )
                uc.params['prefix'] = ''
                # Convert the engine-specific output to csv, map peptides
                # onto the database, unify, validate and filter.
                converted_result = uc.convert(
                    input_file=search_result,
                    guess_engine=True,
                )
                mapped_results = uc.execute_misc_engine(
                    input_file=converted_result,
                    engine='upeptide_mapper'
                )
                unified_search_results = uc.execute_misc_engine(
                    input_file=mapped_results,
                    engine='unify_csv',
                )
                validated_csv = uc.validate(
                    input_file=unified_search_results,
                    engine=validation_engine,
                )
                filtered_validated_results = uc.execute_misc_engine(
                    input_file=validated_csv,
                    engine='filter_csv',
                )
                results_one_file.append(filtered_validated_results)
                uc.params['modifications'].remove(mod)
        # NOTE(review): str.strip('.mzML') removes any of the characters
        # '.', 'm', 'z', 'M', 'L' from both ends of the basename, not the
        # '.mzML' suffix as such — basenames beginning or ending with those
        # characters lose them; os.path.splitext would be safer. Confirm
        # against the expected file names before changing.
        uc.params['prefix'] = '{0}'.format(
            os.path.basename(spec_file).strip('.mzML'))
        all_results_one_file = uc.execute_misc_engine(
            input_file=results_one_file,
            engine='merge_csvs',
        )
        result_files_unfilered.append(all_results_one_file)
        # filter for glycopeptides
        uc.params.update({
            'prefix': 'Glyco_',
            'csv_filter_rules': [
                ['Modifications', 'contains', 'HexNAc'],
                ['Modifications', 'mod_at_glycosite', 'HexNAc'],
            ]
        })
        glyco_file = uc.execute_misc_engine(
            input_file=all_results_one_file,
            engine='filter_csv',
        )
        glyco_result_files.append(glyco_file)
        # sugarpy_run to match glycopeptide fragments in MS1 scans.
        # Modify those parameters according to your MS specifications
        uc.params.update({
            'prefix': '',
            'mzml_input_file': spec_file,
            'min_number_of_spectra': 2,
            'min_glycan_length': 3,
            'max_glycan_length': 20,
            'min_subtree_coverage': 0.65,
            'min_sugarpy_score': 1,
            'ms_level': 1,
            'rt_border_tolerance': 1,
            'precursor_max_charge': 5,
            'use_median_accuracy': 'peptide',
            # Elemental compositions of the monosaccharides considered
            # during glycan tree building (extend as needed).
            'monosaccharide_compositions': {
                "dHex": 'C6H10O4',
                "Hex": 'C6H10O5',
                "HexNAc": 'C8H13NO5',
                "NeuAc": 'C11H17NO8',
                # "dHexNAc": 'C8H13NO4',
                # "HexA": 'C6H8O6',
                # "Me2Hex": 'C8H14O5',
                # "MeHex": 'C7H12O5',
                # "Pent": 'C5H8O4',
                # 'dHexN': 'C6H11O3N',
                # 'HexN': 'C6H11O4N',
                # 'MeHexA': 'C7H10O6',
            },
            'm_score_cutoff': 0.7,
            'mz_score_percentile': 0.7,
            'precursor_mass_tolerance_plus': 10,
            'precursor_mass_tolerance_minus': 10,
            'rel_intensity_range': 0.2,
            'rt_pickle_name': os.path.join(
                os.path.dirname(spec_file),
                '_ursgal_lookup.pkl'
            )
        })
        sugarpy_results = uc.execute_misc_engine(
            input_file=glyco_file,
            engine='sugarpy_run_1_0_0',
        )
        sugarpy_result_files.append(sugarpy_results)
        # sugarpy_plot to plot the results from the sugarpy_run
        # This will plot an elution profile for all identified glycopeptides
        uc.params.update({
            'sugarpy_results_pkl': sugarpy_results.replace('.csv', '.pkl'),
            'sugarpy_results_csv': sugarpy_results,
            'sugarpy_plot_types': [
                'plot_glycan_elution_profile',
            ],
            # Plotly layout for the elution profile figure; commented-out
            # keys show further options that can be enabled.
            'plotly_layout': {
                'font' : {
                    'family':'Arial',
                    # 'size': 20,
                    'color' : 'rgb(0, 0, 0)',
                },
                'autosize':True,
                # 'width':1700,
                # 'height':700,
                # 'margin':{
                #     'l':100,
                #     'r':80,
                #     't':100,
                #     'b':60,
                # },
                'xaxis':{
                    'type':'linear',
                    'color':'rgb(0, 0, 0)',
                    'title_font':{
                        'family':'Arial',
                        'size':20,
                        'color':'rgb(0, 0, 0)',
                    },
                    'autorange':True,
                    # 'range':[0,1000.0],
                    'tickmode':'auto',
                    'showexponent':'all',
                    'exponentformat':'B',
                    'ticklen':5,
                    'tickwidth':1,
                    'tickcolor':'rgb(0, 0, 0)',
                    'ticks':'outside',
                    'showline':True,
                    'linecolor':'rgb(0, 0, 0)',
                    'mirror':False,
                    'showgrid':False,
                },
                'yaxis':{
                    'type':'linear',
                    'color':'rgb(0, 0, 0)',
                    'title':'Intensity',
                    'title_font':{
                        'family':'Arial',
                        'size':20,
                        'color':'rgb(0, 0, 0)',
                    },
                    'autorange':True,
                    # 'range':[0.0,100.0],
                    'showexponent':'all',
                    'exponentformat':'B',
                    'ticklen':5,
                    'tickwidth':1,
                    'tickcolor':'rgb(0, 0, 0)',
                    'ticks':'outside',
                    'showline':True,
                    'linecolor':'rgb(0, 0, 0)',
                    'mirror':False,
                    'showgrid':False,
                },
                'legend':{
                    'orientation':'h',
                    'traceorder':'normal',
                },
                'showlegend':True,
                'paper_bgcolor':'rgba(0,0,0,0)',
                'plot_bgcolor':'rgba(0,0,0,0)',
            },
        })
        sugarpy_plots = uc.execute_misc_engine(
            input_file=spec_file,
            engine='sugarpy_plot_1_0_0',
            # force = True,
        )
        # setting uparams back to original
        uc.params.update({
            'prefix': '',
            'ms_level': 2,
            'precursor_max_charge': 5,
            'csv_filter_rules': [
                ['Is decoy', 'equals', 'false'],
                ['PEP', 'lte', 0.01],
            ],
            'precursor_mass_tolerance_plus' : 5,
            'precursor_mass_tolerance_minus': 5,
            'rt_pickle_name': '_ursgal_lookup.pkl',
            'frag_mass_tolerance': 20,
            # 'rt_pickle_name': os.path.join(
            #     os.path.dirname(spec_file),
            #     '_ursgal_lookup.pkl'
            # )
        })
    # combine results from multiple mzML files
    results_all_files_unfiltered = uc.execute_misc_engine(
        input_file=result_files_unfilered,
        engine='merge_csvs',
    )
    glyco_results_all_files = uc.execute_misc_engine(
        input_file=glyco_result_files,
        engine='merge_csvs',
    )
    sugarpy_results_all_files = uc.execute_misc_engine(
        input_file=sugarpy_result_files,
        engine='merge_csvs',
    )
    # Extract the best matches from the raw SugarPy results.
    # This is used for the automatic filtering described in the manuscript.
    uc.params.update({
        'prefix': '',
        'ms_level': 1,
        'min_number_of_spectra': 2,
        'max_trees_per_spec': 1,
        'sugarpy_results_pkl': None,
        'sugarpy_results_csv': sugarpy_results_all_files,
        'sugarpy_plot_types': [
            'extract_best_matches',
        ],
        'rt_pickle_name': 'create_new_lookup',
    })
    # NOTE(review): spec_file here is the leftover loop variable from the
    # for-loop above (last mzML file); this raises NameError if mzML_files
    # is empty.
    sugarpy_extracted = uc.execute_misc_engine(
        # the specific spec file is not important here,
        # this is just used to localize the scan2rt lookup
        input_file=spec_file,
        engine='sugarpy_plot_1_0_0',
        force = False,
    )
    # Filter extracted results for glycopeptides that were identified
    # in at least two replicates (i.e. two MS files)
    uc.params['csv_filter_rules'] = [
        ['Num Raw Files', 'gte', 2],
    ]
    extracted_filtered = uc.execute_misc_engine(
        input_file=sugarpy_results_all_files.replace('.csv', '_extracted.csv'),
        engine='filter_csv',
    )
# Entry point: requires the mzML folder and the target/decoy database path.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(main.__doc__)
        exit()
    main(
        folder=sys.argv[1],
        target_decoy_database=sys.argv[2],
    )
SugarPy plot annotated specs¶
-
sugarpy_plot_annotated_specs.
main
(spec_file=None, sugarpy_results=None, sugarpy_results_pkl=None)¶ This script can be used to plot any spectrum from SugarPy results. It will plot all spectra listed in the input file (sugarpy_results), therefore it is recommended to provide a csv file with only a few, representative spectra for each glycopeptide. Besides the mzml file corresponding to the glycopeptide identifications, the sugarpy results csv file as well as the corresponding pkl file is required.
- Usage:
- ./sugarpy_plot_annotated_specs.py <mzml.idx.gz> <sugarpy_results.csv> <sugarpy_results_pkl>
#!/usr/bin/env python3.4
# encoding: utf-8
import ursgal
import sys
import glob
import os
import shutil
def main(spec_file=None, sugarpy_results=None, sugarpy_results_pkl=None):
    '''
    This script can be used to plot any spectrum from SugarPy results.
    It will plot all spectra listed in the input file (sugarpy_results),
    therefore it is recommended to provide a csv file with only a few,
    representative spectra for each glycopeptide.
    Besides the mzml file corresponding to the glycopeptide identifications,
    the sugarpy results csv file as well as the corresponding pkl file is required.

    Usage:
        ./sugarpy_plot_annotated_specs.py <mzml.idx.gz> <sugarpy_results.csv> <sugarpy_results_pkl>
    '''
    # initialize the UController
    uc = ursgal.UController()
    # sugarpy_plot to plot the results from the sugarpy_run
    uc.params.update({
        # prefix for the generated output files
        'prefix': 'Annotated_specs',
        'sugarpy_results_pkl': sugarpy_results_pkl,
        'sugarpy_results_csv': sugarpy_results,
        'sugarpy_plot_types': [
            'plot_annotated_spectra',
        ],
        # peak categories that are drawn into each annotated spectrum
        'sugarpy_plot_peak_types': [
            'matched',
            'unmatched',
            'labels',
        ],
        # 'sugarpy_include_subtrees': 'individual_subtrees'
        # scan -> retention time lookup; expected next to the spectrum file
        'rt_pickle_name': os.path.join(
            os.path.dirname(spec_file),
            '_ursgal_lookup.pkl'
        ),
        'ms_level': 1,
        # plotly figure layout controlling fonts, axes and canvas size of
        # the annotated spectra (passed through to plotly unchanged)
        'plotly_layout': {
            'font' : {
                'family':'Arial',
                'size': 26,
                'color' : 'rgb(0, 0, 0)',
            },
            # 'autosize':True,
            'width':1750,
            'height':820,
            'margin':{
                'l':120,
                'r':50,
                't':50,
                'b':170,
            },
            'xaxis':{
                'type':'linear',
                'color':'rgb(0, 0, 0)',
                'title': 'm/z',
                'title_font':{
                    'family':'Arial',
                    'size':26,
                    'color':'rgb(0, 0, 0)',
                },
                'autorange':True,
                # 'range':[1000,1550.0],
                'tickmode':'auto',
                'showticklabels':True,
                'tickfont':{
                    'family':'Arial',
                    'size':22,
                    'color':'rgb(0, 0, 0)',
                },
                'showexponent':'all',
                'exponentformat':'B',
                'ticklen':5,
                'tickwidth':1,
                'tickcolor':'rgb(0, 0, 0)',
                'ticks':'outside',
                'showline':True,
                'linecolor':'rgb(0, 0, 0)',
                'showgrid':False,
                'dtick':100,
            },
            'yaxis':{
                'type':'linear',
                'color':'rgb(0, 0, 0)',
                'title':'Intensity',
                'title_font':{
                    'family':'Arial',
                    'size':26,
                    'color':'rgb(0, 0, 0)',
                },
                'autorange':True,
                # 'range':[0.0,25.0],
                'showticklabels':True,
                'tickfont':{
                    'family':'Arial',
                    'size':22,
                    'color':'rgb(0, 0, 0)',
                },
                'tickangle':0,
                'showexponent':'all',
                'exponentformat':'B',
                'ticklen':5,
                'tickwidth':1,
                'tickcolor':'rgb(0, 0, 0)',
                'ticks':'outside',
                'showline':True,
                'linecolor':'rgb(0, 0, 0)',
                'showgrid':False,
                'zeroline':False,
            },
            # 'legend':{
            #     'font':{
            #         'family':'Arial',
            #         'size':10,
            #         'color':'rgb(0, 0, 0)',
            #     },
            #     'orientation':'v',
            #     'traceorder':'normal',
            # },
            'showlegend':False,
            'paper_bgcolor':'rgba(0,0,0,0)',
            'plot_bgcolor':'rgba(0,0,0,0)',
        },
    })
    sugarpy_plots = uc.execute_misc_engine(
        input_file=spec_file,
        engine='sugarpy_plot_1_0_0',
        force = True,
    )
if __name__ == '__main__':
    # The usage documented in main() requires three arguments:
    # <mzml.idx.gz> <sugarpy_results.csv> <sugarpy_results_pkl>
    if len(sys.argv) < 4:
        print(main.__doc__)
        sys.exit()
    main(
        spec_file=sys.argv[1],
        sugarpy_results=sys.argv[2],
        # BUG FIX: the pkl file is required by main() but was never
        # passed, leaving sugarpy_results_pkl=None
        sugarpy_results_pkl=sys.argv[3],
    )
Cascade Search¶
Cascade search example¶
-
cascade_search_example.
search
(validation_engine)¶ Executes a cascade search on four example files from the data from Barth et al.
- usage:
- ./cascade_search_example.py
Searches for peptides using a cascade search approach similar to Kertesz-Farkas et al. for which spectra were first searched for unmodified peptides, followed by consecutive searches for the following modifications: oxidation of M, deamidation of N/Q, methylation of E/K/R, N-terminal acetylation, phosphorylation of S/T. After each step, spectra with a PSM below 1 % PEP were removed.
-
cascade_search_example.
analyze
(collector)¶ Simple analysis script for the cascade search, counting the number of identified peptides (combination of peptide sequence and modifications) and PSMs (additionally include the spectrum ID)
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import csv
from collections import defaultdict as ddict
import os
import glob
import math
# Parameters shared by all engines: the target-decoy database and the
# csv filter rules (keep target PSMs with a PEP of at most 1 %).
params = {
    "database": os.path.join(
        os.pardir,
        "example_data",
        "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    ),
    "csv_filter_rules": [
        ["Is decoy", "equals", "false"],
        ["PEP", "lte", 0.01],
    ],
}
# We specify all search engines and validation engines that we want
# to use in a list (version numbers might differ on windows or mac):
search_engines = [
    "omssa",
    "xtandem_piledriver",
    "msgfplus_v9979",
    # 'myrimatch_2_1_138',
    # 'msamanda_1_0_0_5243',
]
validation_engines = [
    "percolator_2_08",
    "qvality",
]
# The different levels with different modifications
# for the cascade are defined; every level keeps the fixed
# carbamidomethylation and adds one kind of optional modification
# (modification format: aa,fix/opt,position,unimod name)
cascade = {
    "0": ["C,fix,any,Carbamidomethyl"],
    "1": ["C,fix,any,Carbamidomethyl", "N,opt,any,Deamidated", "Q,opt,any,Deamidated"],
    "2": ["C,fix,any,Carbamidomethyl", "*,opt,Prot-N-term,Acetyl"],
    "3": ["C,fix,any,Carbamidomethyl", "M,opt,any,Oxidation"],
    "4": [
        "C,fix,any,Carbamidomethyl",
        "E,opt,any,Methyl",
        "K,opt,any,Methyl",
        "R,opt,any,Methyl",
    ],
    "5": ["C,fix,any,Carbamidomethyl", "S,opt,any,Phospho", "T,opt,any,Phospho"],
}
mass_spectrometer = "LTQ XL low res"
# Download locations: example spectra from the public PeptideAtlas
# dataset PASS00269 (login/password are the published dataset
# credentials) via ftp, and the target-decoy database via http.
get_params = {
    "ftp_url": "ftp.peptideatlas.org",
    "ftp_login": "PASS00269",
    "ftp_password": "FI4645a",
    "ftp_include_ext": [
        "JB_FASP_pH8_2-3_28122012.mzML",
        "JB_FASP_pH8_2-4_28122012.mzML",
        "JB_FASP_pH8_3-1_28122012.mzML",
        "JB_FASP_pH8_4-1_28122012.mzML",
    ],
    "ftp_output_folder": os.path.join(os.pardir, "example_data", "cascade_search"),
    "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    "http_output_folder": os.path.join(os.pardir, "example_data"),
}
def get_files():
    """
    Download the example mzML files and the target-decoy database if they
    are not present yet, and return the list of local mzML paths.
    """
    controller = ursgal.UController(params=get_params)
    # the fasta database is fetched via http only once
    if not os.path.exists(params["database"]):
        controller.fetch_file(engine="get_http_files_1_0_0")
    # the spectra are fetched via ftp only when the folder is missing
    ftp_folder = get_params["ftp_output_folder"]
    if not os.path.exists(ftp_folder):
        os.makedirs(ftp_folder)
        controller.fetch_file(engine="get_ftp_files_1_0_0")
    return [
        mzml_path
        for mzml_path in glob.glob(os.path.join(ftp_folder, "*.mzML"))
    ]
def search(validation_engine):
    """
    Executes a cascade search on four example files from the
    data from Barth et al.

    usage:
        ./cascade_search_example.py

    Searches for peptides using a cascade search approach similar to Kertesz-Farkas et al.
    for which spectra were first searched for unmodified peptides, followed by consecutive searches
    for the following modifications:
    oxidation of M,
    deamidation of N/Q,
    methylation of E/K/R,
    N-terminal acetylation,
    phosphorylation of S/T.
    After each step, spectra with a PSM below 1 % PEP were removed.
    """
    # Initializing the uPLANIT UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer, params=params  # 'LTQ XL low res' profile!
    )

    # complete workflow for every level of the cascade:
    # every spectrum file is searched with every search engine,
    # results are validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.
    def workflow(
        spec_file,
        prefix=None,
        validation_engine=None,
        filter_before_validation=False,
        force=False,
    ):
        # Runs all search engines + validation for one spectrum file at the
        # current cascade level and returns the filtered result csv.
        validated_results = []
        # Convert mzML to MGF outside the loop, so this step is not repeated in
        # the loop
        mgf_spec_file = uc.convert(input_file=spec_file, engine="mzml2mgf_1_0_0")
        for search_engine in search_engines:
            uc.params["prefix"] = prefix
            unified_search_results = uc.search(
                input_file=mgf_spec_file,
                engine=search_engine,
                force=force,
            )
            uc.params["prefix"] = ""
            if filter_before_validation == True:
                # keep only PSMs carrying the current level's modification;
                # the unimod name is the 4th field of the level's second
                # modification string (all opt mods of a level share it).
                # NOTE(review): `level` is a free variable bound in the loop
                # below -- workflow() must only be called after it is set.
                uc.params["csv_filter_rules"] = [
                    [
                        "Modifications",
                        "contains",
                        "{0}".format(cascade[level][1].split(",")[3]),
                    ]
                ]
                filtered_search_results = uc.execute_misc_engine(
                    input_file=unified_search_results, engine="filter_csv_1_0_0"
                )
            else:
                filtered_search_results = unified_search_results
            validated_search_results = uc.validate(
                input_file=filtered_search_results,
                engine=validation_engine,
                force=force,
            )
            validated_results.append(validated_search_results)
        validated_results_from_all_engines = uc.execute_misc_engine(
            input_file=sorted(validated_results),
            engine="merge_csvs_1_0_0",
            force=force,
        )
        # keep target PSMs with PEP <= 1 % only
        uc.params["csv_filter_rules"] = [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=validated_results_from_all_engines, engine="filter_csv_1_0_0"
        )
        return filtered_validated_results

    result_files = []
    for spec_file in spec_files:
        # spectra already explained at an earlier cascade level
        spectra_with_PSM = set()
        for level in sorted(cascade.keys()):
            uc.params["modifications"] = cascade[level]
            if level == "0":
                # first level: unmodified search, no modification filter
                results = workflow(
                    spec_file,
                    validation_engine=validation_engine,
                    prefix="cascade-lvl-{0}".format(level),
                )
            else:
                # later levels: exclude already-explained spectra and filter
                # the search results for this level's modification
                uc.params["scan_exclusion_list"] = list(spectra_with_PSM)
                results = workflow(
                    spec_file,
                    validation_engine=validation_engine,
                    filter_before_validation=True,
                    force=True,
                    prefix="cascade-lvl-{0}".format(level),
                )
            result_files.append(results)
            # spectrum IDs for PSMs are written into an exclusion list for the next level of the cascade search,
            # these spectra will be excluded during mzml2mgf conversion
            with open(results) as in_file:
                csv_input = csv.DictReader(in_file)
                for line_dict in csv_input:
                    spectra_with_PSM.add(line_dict["Spectrum ID"])
            print(
                "Number of spectra that will be removed for the next cacade level: {0}".format(
                    len(spectra_with_PSM)
                )
            )
    results_all_files = uc.execute_misc_engine(
        input_file=sorted(result_files),
        engine="merge_csvs_1_0_0",
    )
    return results_all_files
def analyze(collector):
    """
    Simple analysis script for the cascade search,
    counting the number of identified peptides (combination of peptide sequence and modifications)
    and PSMs (additionally include the spectrum ID).

    `collector` maps a validation engine name to the merged result csv
    produced by search(); one row per validation engine and count type
    ("pep"/"psm") is written to cascade_results.csv.
    """
    mod_list = ["Oxidation", "Deamidated", "Methyl", "Acetyl", "Phospho"]
    fieldnames = (
        ["approach", "count_type", "validation_engine", "unmodified", "multimodified"]
        + mod_list
        + ["total"]
    )
    # keep a handle so the output file can be closed explicitly (it was
    # opened inline and leaked); newline="" prevents blank lines in the
    # csv output on Windows
    out_file = open("cascade_results.csv", "w", newline="")
    csv_writer = csv.DictWriter(out_file, fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params["validation_score_field"] = "PEP"
    uc.params["bigger_scores_better"] = False
    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of
    # magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        # unique peptidoforms ("pep") and PSMs ("psm") per modification class
        counter_dict = {"psm": ddict(set), "pep": ddict(set)}
        grouped_psms = uc._group_psms(
            result_file, validation_score_field="PEP", bigger_scores_better=False
        )
        for spec_title, grouped_psm_list in grouped_psms.items():
            # the list is assumed sorted best (lowest PEP) first by _group_psms
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = (
                    best_line_dict["Sequence"] + best_line_dict["Modifications"]
                )
                second_best_peptide_and_mod = (
                    second_best_line_dict["Sequence"]
                    + second_best_line_dict["Modifications"]
                )
                if best_peptide_and_mod == second_best_peptide_and_mod:
                    # same peptidoform reported twice -> unambiguous
                    line_dict = best_line_dict
                elif best_line_dict["Sequence"] == second_best_line_dict["Sequence"]:
                    # same sequence, different modifications
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        # accept only if the best PEP is >= 2 orders of
                        # magnitude smaller, otherwise skip the spectrum
                        # NOTE(review): math.log10 raises for a PEP of 0
                        if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)
                        ) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    # different sequences: same two-orders-of-magnitude rule
                    if (-1 * math.log10(best_score)) - (
                        -1 * math.log10(second_best_score)
                    ) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict
            # classify the accepted PSM by how many of the tracked
            # modifications it carries
            count = 0
            for mod in mod_list:
                if mod in line_dict["Modifications"]:
                    count += 1
            key_2_add = ""
            if count == 0:
                key_2_add = "unmodified"
            elif count >= 2:
                key_2_add = "multimodified"
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict["Modifications"]:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict["pep"][key_2_add].add(
                line_dict["Sequence"] + line_dict["Modifications"]
            )
            # for PSM comparison
            counter_dict["psm"][key_2_add].add(
                line_dict["Spectrum Title"]
                + line_dict["Sequence"]
                + line_dict["Modifications"]
            )
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                "approach": "cascade",
                "count_type": counter_key,
                "validation_engine": validation_engine,
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write["total"] = total_number
            csv_writer.writerow(dict_2_write)
    out_file.close()
    return
if __name__ == "__main__":
    # fetch the example data once, then run one full cascade per
    # validation engine and summarize the collected results
    spec_files = get_files()
    collector = {}
    for engine in validation_engines:
        merged_result = search(engine)
        collector[engine] = merged_result
        print(
            ">>> ",
            "final results for {0}".format(engine),
            " were written into:",
        )
        print(">>> ", merged_result)
    analyze(collector)
    print(">>> ", "number of identified peptides and PSMs were written into:")
    print(">>> ", "cascade_results.csv")
Ungrouped search for comparison¶
-
ungrouped_search_example.
search
(validation_engine)¶ Executes an ungrouped search on four example files from the data from Barth et al.
- usage:
- ./ungrouped_search_example.py
Searches for peptides including the following potential modifications: oxidation of M, deamidation of N/Q, methylation of E/K/R, N-terminal acetylation, phosphorylation of S/T.
All modifications are validated together with unmodified peptides.
-
ungrouped_search_example.
analyze
(collector)¶ Simple analysis script for the ungrouped search, counting the number of identified peptides (combination of peptide sequence and modifications) and PSMs (additionally include the spectrum ID)
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import csv
from collections import defaultdict as ddict
import os
import glob
import math
import sys
# Parameters shared by all engines: the target-decoy database, the csv
# filter rules (keep target PSMs with a PEP of at most 1 %) and all
# modifications, searched together in a single (ungrouped) search.
params = {
    "database": os.path.join(
        os.pardir,
        "example_data",
        "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    ),
    "csv_filter_rules": [
        ["Is decoy", "equals", "false"],
        ["PEP", "lte", 0.01],
    ],
    # Modifications that should be included in the search
    # (format: aa,fix/opt,position,unimod name)
    "modifications": [
        "C,fix,any,Carbamidomethyl",
        "M,opt,any,Oxidation",
        "N,opt,any,Deamidated",
        "Q,opt,any,Deamidated",
        "E,opt,any,Methyl",
        "K,opt,any,Methyl",
        "R,opt,any,Methyl",
        "*,opt,Prot-N-term,Acetyl",
        "S,opt,any,Phospho",
        "T,opt,any,Phospho",
    ],
}
# We specify all search engines and validation engines that we want
# to use in a list (version numbers might differ on windows or mac):
search_engines = [
    "omssa",
    "xtandem_piledriver",
    "msgfplus_v9979",
    # 'myrimatch_2_1_138',
    "msamanda_1_0_0_5243",
]
validation_engines = [
    "percolator_2_08",
    "qvality",
]
mass_spectrometer = "LTQ XL low res"
# Download locations: example spectra from the public PeptideAtlas
# dataset PASS00269 (login/password are the published dataset
# credentials) via ftp, and the target-decoy database via http.
get_params = {
    "ftp_url": "ftp.peptideatlas.org",
    "ftp_login": "PASS00269",
    "ftp_password": "FI4645a",
    "ftp_include_ext": [
        "JB_FASP_pH8_2-3_28122012.mzML",
        "JB_FASP_pH8_2-4_28122012.mzML",
        "JB_FASP_pH8_3-1_28122012.mzML",
        "JB_FASP_pH8_4-1_28122012.mzML",
    ],
    "ftp_output_folder": os.path.join(os.pardir, "example_data", "ungrouped_search"),
    "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    "http_output_folder": os.path.join(os.pardir, "example_data"),
}
def get_files():
    """
    Download the example mzML files and the target-decoy database if they
    are not present yet, and return the list of local mzML paths.
    """
    controller = ursgal.UController(params=get_params)
    # the fasta database is fetched via http only once
    if not os.path.exists(params["database"]):
        controller.fetch_file(engine="get_http_files_1_0_0")
    # the spectra are fetched via ftp only when the folder is missing
    ftp_folder = get_params["ftp_output_folder"]
    if not os.path.exists(ftp_folder):
        os.makedirs(ftp_folder)
        controller.fetch_file(engine="get_ftp_files_1_0_0")
    return [
        mzml_path
        for mzml_path in glob.glob(os.path.join(ftp_folder, "*.mzML"))
    ]
def search(validation_engine):
    """
    Run the ungrouped search on the four example files from the data of
    Barth et al.

    usage:
        ./ungrouped_search_example.py

    Every spectrum file is searched with every search engine, including all
    potential modifications at once (oxidation of M, deamidation of N/Q,
    methylation of E/K/R, N-terminal acetylation, phosphorylation of S/T).
    All modifications are validated together with unmodified peptides; per
    file, the validated results of all engines are merged and filtered for
    target PSMs with PEP <= 0.01, and finally the filtered results of all
    files are merged into one csv, whose path is returned.
    """
    controller = ursgal.UController(
        profile=mass_spectrometer,  # 'LTQ XL low res' profile!
        params=params,
    )
    per_file_results = []
    for spectrum_file in spec_files:
        # search and validate with each engine, collecting the result csvs
        validated_csvs = []
        for engine_name in search_engines:
            raw_results = controller.search(
                input_file=spectrum_file,
                engine=engine_name,
            )
            validated_csvs.append(
                controller.validate(
                    input_file=raw_results,
                    engine=validation_engine,
                )
            )
        merged_csv = controller.execute_misc_engine(
            input_file=sorted(validated_csvs),
            engine="merge_csvs",
        )
        # filter rules (targets, PEP <= 0.01) come from the shared `params`
        per_file_results.append(
            controller.execute_misc_engine(
                input_file=merged_csv,
                engine="filter_csv",
            )
        )
    # combine the filtered results of all spectrum files
    return controller.execute_misc_engine(
        input_file=sorted(per_file_results),
        engine="merge_csvs",
    )
def analyze(collector):
    """
    Simple analysis script for the ungrouped search,
    counting the number of identified peptides (combination of peptide sequence and modifications)
    and PSMs (additionally include the spectrum ID).

    `collector` maps a validation engine name to the merged result csv
    produced by search(); one row per validation engine and count type
    ("pep"/"psm") is written to ungrouped_results.csv.
    """
    mod_list = ["Oxidation", "Deamidated", "Methyl", "Acetyl", "Phospho"]
    fieldnames = (
        ["approach", "count_type", "validation_engine", "unmodified", "multimodified"]
        + mod_list
        + ["total"]
    )
    # keep a handle so the output file can be closed explicitly (it was
    # opened inline and leaked); newline="" prevents blank lines in the
    # csv output on Windows
    out_file = open("ungrouped_results.csv", "w", newline="")
    csv_writer = csv.DictWriter(out_file, fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params["validation_score_field"] = "PEP"
    uc.params["bigger_scores_better"] = False
    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of
    # magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        # unique peptidoforms ("pep") and PSMs ("psm") per modification class
        counter_dict = {"psm": ddict(set), "pep": ddict(set)}
        grouped_psms = uc._group_psms(
            result_file, validation_score_field="PEP", bigger_scores_better=False
        )
        for spec_title, grouped_psm_list in grouped_psms.items():
            # the list is assumed sorted best (lowest PEP) first by _group_psms
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = (
                    best_line_dict["Sequence"] + best_line_dict["Modifications"]
                )
                second_best_peptide_and_mod = (
                    second_best_line_dict["Sequence"]
                    + second_best_line_dict["Modifications"]
                )
                if best_peptide_and_mod == second_best_peptide_and_mod:
                    # same peptidoform reported twice -> unambiguous
                    line_dict = best_line_dict
                elif best_line_dict["Sequence"] == second_best_line_dict["Sequence"]:
                    # same sequence, different modifications
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        # accept only if the best PEP is >= 2 orders of
                        # magnitude smaller, otherwise skip the spectrum
                        # NOTE(review): math.log10 raises for a PEP of 0
                        if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)
                        ) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    # different sequences: same two-orders-of-magnitude rule
                    if (-1 * math.log10(best_score)) - (
                        -1 * math.log10(second_best_score)
                    ) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict
            # classify the accepted PSM by how many of the tracked
            # modifications it carries
            count = 0
            for mod in mod_list:
                if mod in line_dict["Modifications"]:
                    count += 1
            key_2_add = ""
            if count == 0:
                key_2_add = "unmodified"
            elif count >= 2:
                key_2_add = "multimodified"
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict["Modifications"]:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict["pep"][key_2_add].add(
                line_dict["Sequence"] + line_dict["Modifications"]
            )
            # for PSM comparison
            counter_dict["psm"][key_2_add].add(
                line_dict["Spectrum Title"]
                + line_dict["Sequence"]
                + line_dict["Modifications"]
            )
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                "approach": "ungrouped",
                "count_type": counter_key,
                "validation_engine": validation_engine,
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write["total"] = total_number
            csv_writer.writerow(dict_2_write)
    out_file.close()
    return
if __name__ == "__main__":
    # fetch the example data once, then run one full ungrouped search per
    # validation engine and summarize the collected results
    spec_files = get_files()
    collector = {}
    for engine in validation_engines:
        merged_result = search(engine)
        collector[engine] = merged_result
        print(
            ">>> ",
            "final results for {0}".format(engine),
            " were written into:",
        )
        print(">>> ", merged_result)
    analyze(collector)
    print(">>> ", "number of identified peptides and PSMs were written into:")
    print(">>> ", "ungrouped_results.csv")
Grouped search for comparison¶
-
grouped_search_example.
search
(validation_engine)¶ Executes a grouped search on four example files from the data from Barth et al.
- usage:
- ./grouped_search_example.py
Searches for peptides including the following potential modifications: oxidation of M, deamidation of N/Q, methylation of E/K/R, N-terminal acetylation, phosphorylation of S/T.
After the search, each type of modification is validated seperately.
-
grouped_search_example.
analyze
(collector)¶ Simple analysis script for the grouped search, counting the number of identified peptides (combination of peptide sequence and modifications) and PSMs (additionally include the spectrum ID)
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import csv
from collections import defaultdict as ddict
import os
import glob
import math
from collections import defaultdict as ddict
# Parameters shared by all engines: the target-decoy database, the csv
# filter rules (keep target PSMs with a PEP of at most 1 %) and all
# modifications that are searched together but validated per group.
params = {
    "database": os.path.join(
        os.pardir,
        "example_data",
        "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    ),
    "csv_filter_rules": [
        ["Is decoy", "equals", "false"],
        ["PEP", "lte", 0.01],
    ],
    # Modifications that should be included in the search
    # (format: aa,fix/opt,position,unimod name)
    "modifications": [
        "C,fix,any,Carbamidomethyl",
        "M,opt,any,Oxidation",
        "N,opt,any,Deamidated",
        "Q,opt,any,Deamidated",
        "E,opt,any,Methyl",
        "K,opt,any,Methyl",
        "R,opt,any,Methyl",
        "*,opt,Prot-N-term,Acetyl",
        "S,opt,any,Phospho",
        "T,opt,any,Phospho",
    ],
}
# We specify all search engines and validation engines that we want
# to use in a list (version numbers might differ on windows or mac):
search_engines = [
    "omssa",
    "xtandem_piledriver",
    "msgfplus_v9979",
    # 'myrimatch_2_1_138',
    "msamanda_1_0_0_5243",
]
validation_engines = [
    "percolator_2_08",
    "qvality",
]
# Groups that are evaluated separately; the empty string ("0") collects
# unmodified peptides, every other group one unimod modification name
groups = {
    "0": "",
    "1": "Oxidation",
    "2": "Deamidated",
    "3": "Methyl",
    "4": "Acetyl",
    "5": "Phospho",
}
mass_spectrometer = "LTQ XL low res"
# Download locations: example spectra from the public PeptideAtlas
# dataset PASS00269 (login/password are the published dataset
# credentials) via ftp, and the target-decoy database via http.
get_params = {
    "ftp_url": "ftp.peptideatlas.org",
    "ftp_login": "PASS00269",
    "ftp_password": "FI4645a",
    "ftp_include_ext": [
        "JB_FASP_pH8_2-3_28122012.mzML",
        "JB_FASP_pH8_2-4_28122012.mzML",
        "JB_FASP_pH8_3-1_28122012.mzML",
        "JB_FASP_pH8_4-1_28122012.mzML",
    ],
    "ftp_output_folder": os.path.join(os.pardir, "example_data", "grouped_search"),
    "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
    "http_output_folder": os.path.join(os.pardir, "example_data"),
}
def get_files():
    """
    Download the example mzML files and the target-decoy database if they
    are not present yet, and return the list of local mzML paths.
    """
    controller = ursgal.UController(params=get_params)
    # the fasta database is fetched via http only once
    if not os.path.exists(params["database"]):
        controller.fetch_file(engine="get_http_files_1_0_0")
    # the spectra are fetched via ftp only when the folder is missing
    ftp_folder = get_params["ftp_output_folder"]
    if not os.path.exists(ftp_folder):
        os.makedirs(ftp_folder)
        controller.fetch_file(engine="get_ftp_files_1_0_0")
    return [
        mzml_path
        for mzml_path in glob.glob(os.path.join(ftp_folder, "*.mzML"))
    ]
def search(validation_engine):
    """
    Executes a grouped search on four example files from the
    data from Barth et al.

    usage:
        ./grouped_search_example.py

    Searches for peptides including the following potential modifications:
    oxidation of M,
    deamidation of N/Q,
    methylation of E/K/R,
    N-terminal acetylation,
    phosphorylation of S/T.
    After the search, each type of modification is validated separately.
    """
    # Initializing the ursgal UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer, params=params  # 'LTQ XL low res' profile!
    )
    # complete workflow:
    # every spectrum file is searched with every search engine,
    # results are separated into groups and validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.
    # In the end, all filtered results from all spectrum files are merged
    result_files = []
    for n, spec_file in enumerate(spec_files):
        validated_results = []
        for search_engine in search_engines:
            unified_search_results = uc.search(
                input_file=spec_file,
                engine=search_engine,
            )
            # Calculate PEP for every group seperately, therefore need to split
            # the csv first
            group_list = sorted(groups.keys())
            for group in group_list:
                if group == "0":
                    # group "0" = unmodified PSMs: exclude every
                    # modification that defines one of the other groups
                    uc.params["csv_filter_rules"] = [
                        ["Modifications", "contains_not", "{0}".format(groups["1"])],
                        ["Modifications", "contains_not", "{0}".format(groups["2"])],
                        ["Modifications", "contains_not", "{0}".format(groups["3"])],
                        ["Modifications", "contains_not", "{0}".format(groups["4"])],
                        ["Modifications", "contains_not", "{0}".format(groups["5"])],
                    ]
                else:
                    # keep PSMs carrying exactly this group's modification
                    uc.params["csv_filter_rules"] = [
                        ["Modifications", "contains", "{0}".format(groups[group])]
                    ]
                    for other_group in group_list:
                        if other_group == "0" or other_group == group:
                            continue
                        uc.params["csv_filter_rules"].append(
                            [
                                "Modifications",
                                "contains_not",
                                "{0}".format(groups[other_group]),
                            ],
                        )
                uc.params["prefix"] = "grouped-{0}".format(group)
                filtered_results = uc.execute_misc_engine(
                    input_file=unified_search_results, engine="filter_csv"
                )
                uc.params["prefix"] = ""
                validated_search_results = uc.validate(
                    input_file=filtered_results,
                    engine=validation_engine,
                )
                validated_results.append(validated_search_results)
        uc.params["prefix"] = "file{0}".format(n)
        validated_results_from_all_engines = uc.execute_misc_engine(
            input_file=sorted(validated_results),
            engine="merge_csvs",
        )
        uc.params["prefix"] = ""
        uc.params["csv_filter_rules"] = [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=validated_results_from_all_engines, engine="filter_csv"
        )
        result_files.append(filtered_validated_results)
    # BUG FIX: this call previously used the keyword `input_files`, while
    # every other execute_misc_engine call in this script uses `input_file`
    results_all_files = uc.execute_misc_engine(
        input_file=sorted(result_files),
        engine="merge_csvs",
    )
    return results_all_files
def analyze(collector):
    """
    Simple analysis script for the grouped search,
    counting the number of identified peptides (combination of peptide sequence and modifications)
    and PSMs (additionally include the spectrum ID).

    `collector` maps a validation engine name to the merged result csv
    produced by search(); one row per validation engine and count type
    ("pep"/"psm") is written to grouped_results.csv.
    """
    mod_list = ["Oxidation", "Deamidated", "Methyl", "Acetyl", "Phospho"]
    fieldnames = (
        ["approach", "count_type", "validation_engine", "unmodified", "multimodified"]
        + mod_list
        + ["total"]
    )
    # keep a handle so the output file can be closed explicitly (it was
    # opened inline and leaked); newline="" prevents blank lines in the
    # csv output on Windows
    out_file = open("grouped_results.csv", "w", newline="")
    csv_writer = csv.DictWriter(out_file, fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params["validation_score_field"] = "PEP"
    uc.params["bigger_scores_better"] = False
    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of
    # magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        # unique peptidoforms ("pep") and PSMs ("psm") per modification class
        counter_dict = {"psm": ddict(set), "pep": ddict(set)}
        grouped_psms = uc._group_psms(
            result_file, validation_score_field="PEP", bigger_scores_better=False
        )
        for spec_title, grouped_psm_list in grouped_psms.items():
            # the list is assumed sorted best (lowest PEP) first by _group_psms
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = (
                    best_line_dict["Sequence"] + best_line_dict["Modifications"]
                )
                second_best_peptide_and_mod = (
                    second_best_line_dict["Sequence"]
                    + second_best_line_dict["Modifications"]
                )
                if best_peptide_and_mod == second_best_peptide_and_mod:
                    # same peptidoform reported twice -> unambiguous
                    line_dict = best_line_dict
                elif best_line_dict["Sequence"] == second_best_line_dict["Sequence"]:
                    # same sequence, different modifications
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        # accept only if the best PEP is >= 2 orders of
                        # magnitude smaller, otherwise skip the spectrum
                        # NOTE(review): math.log10 raises for a PEP of 0
                        if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)
                        ) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    # different sequences: same two-orders-of-magnitude rule
                    if (-1 * math.log10(best_score)) - (
                        -1 * math.log10(second_best_score)
                    ) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict
            # classify the accepted PSM by how many of the tracked
            # modifications it carries
            count = 0
            for mod in mod_list:
                if mod in line_dict["Modifications"]:
                    count += 1
            key_2_add = ""
            if count == 0:
                key_2_add = "unmodified"
            elif count >= 2:
                key_2_add = "multimodified"
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict["Modifications"]:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict["pep"][key_2_add].add(
                line_dict["Sequence"] + line_dict["Modifications"]
            )
            # for PSM comparison
            counter_dict["psm"][key_2_add].add(
                line_dict["Spectrum Title"]
                + line_dict["Sequence"]
                + line_dict["Modifications"]
            )
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                "approach": "grouped",
                "count_type": counter_key,
                "validation_engine": validation_engine,
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write["total"] = total_number
            csv_writer.writerow(dict_2_write)
    out_file.close()
    return
if __name__ == "__main__":
    # fetch the example data once, then run one full grouped search per
    # validation engine and summarize the collected results
    spec_files = get_files()
    collector = {}
    for engine in validation_engines:
        merged_result = search(engine)
        collector[engine] = merged_result
        print(
            ">>> ",
            "final results for {0}".format(engine),
            " were written into:",
        )
        print(">>> ", merged_result)
    analyze(collector)
    print(">>> ", "number of identified peptides and PSMs were written into:")
    print(">>> ", "grouped_results.csv")
Example results for cascade search¶
ROS dataset results from a subset of data from Barth et al. 2014.
approach | count_type | validation_engine | unmodified | multimodified | Oxidation | Deamidated | Methyl | Acetyl | Phospho | total |
---|---|---|---|---|---|---|---|---|---|---|
ungrouped | psm | qvality | 6905 | 24 | 270 | 157 | 42 | 23 | 6 | 7427 |
grouped | psm | qvality | 8009 | 267 | 128 | 31 | 27 | 3 | 8465 | |
cascade | psm | qvality | 8009 | 316 | 131 | 46 | 30 | 1 | 7788 | |
ungrouped | upep | qvality | 1409 | 14 | 94 | 48 | 20 | 11 | 3 | 1599 |
grouped | upep | qvality | 1575 | 93 | 42 | 9 | 17 | 1 | 1737 | |
cascade | upep | qvality | 1552 | 102 | 44 | 13 | 13 | 1 | 1725 |
Human BR dataset results.
approach | count_type | validation_engine | unmodified | multimodified | Oxidation | Deamidated | Methyl | Acetyl | Phospho | total |
---|---|---|---|---|---|---|---|---|---|---|
ungrouped | psm | qvality | 18813 | 51 | 332 | 336 | 102 | 292 | 122 | 20048 |
grouped | psm | qvality | 21370 | 383 | 171 | 100 | 435 | 142 | 22601 | |
cascade | psm | qvality | 19065 | 424 | 251 | 111 | 417 | 38 | 20406 | |
ungrouped | upep | qvality | 6707 | 39 | 143 | 173 | 27 | 123 | 58 | 7270 |
grouped | upep | qvality | 7525 | 161 | 104 | 31 | 191 | 58 | 8070 | |
cascade | upep | qvality | 7784 | 172 | 152 | 43 | 171 | 55 | 8377 |
Upeptide Mapper Example Scripts¶
Upeptide mapper v3 and 4 complete C. reinhardtii proteome match¶
-
complete_chlamydomonas_proteome_match.
main
(class_version)¶ Example script to demonstrate speed and memory efficiency of the new upeptide_mapper.
All tryptic peptides (n=1,094,395, 6 < len(peptide) < 40 ) are mapped to the Chlamydomonas reinhardtii (38876 entries) target-decoy database.
- usage:
- ./complete_chlamydomonas_proteome_match.py <class_version>
- Class versions
- UPeptideMapper_v2
- UPeptideMapper_v3
- UPeptideMapper_v4
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import os.path
import sys
import time
def main(class_version):
    """
    Example script to demonstrate speed and memory efficiency of the new
    upeptide_mapper.

    All tryptic peptides (n=1,094,395, 6 < len(peptide) < 40) are mapped to the
    Chlamydomonas reinhardtii (38876 entries) target-decoy database.

    usage:
        ./complete_chlamydomonas_proteome_match.py <class_version>

    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4
    """
    input_params = {
        # Target-decoy fasta database; downloaded on first run (see below).
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(
            os.pardir,
            "example_data",
        ),
    }
    uc = ursgal.UController(params=input_params)
    # Fetch the database only when it is not present locally yet.
    if not os.path.exists(input_params["database"]):
        uc.fetch_file(engine="get_http_files_1_0_0")
    print("Parsing fasta and digesting sequences")
    peptides = set()
    # perf_counter is the monotonic clock intended for elapsed-time measurement.
    digest_start = time.perf_counter()
    # Context manager closes the fasta file handle reliably (was leaked before).
    with open(input_params["database"], "r") as fasta_fh:
        for fastaID, sequence in ursgal.ucore.parse_fasta(fasta_fh):
            tryptic_peptides = ursgal.ucore.digest(
                sequence, ("KR", "C"), no_missed_cleavages=True
            )
            for p in tryptic_peptides:
                # Keep only peptides in the search-relevant length range.
                if 6 <= len(p) <= 40:
                    peptides.add(p)
    print(
        "Parsing fasta and digesting sequences took {0:1.2f} seconds".format(
            time.perf_counter() - digest_start
        )
    )
    if sys.platform == "win32":
        # pyahocorasick (required by the v3/v4 mappers) cannot be pip-installed
        # on Windows, so force the pure-Python v2 implementation.
        print(
            "[ WARNING ] pyahocorasick can not be installed via pip on Windows at the moment\n"
            "[ WARNING ] Falling back to UPeptideMapper_v2"
        )
        class_version = "UPeptideMapper_v2"
    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"
    ].import_engine_as_python_function(class_version)
    print("Buffering fasta and mapping {0} peptides".format(len(peptides)))
    map_start = time.perf_counter()
    # The mapper versions differ in construction and map_peptides() signature.
    if class_version == "UPeptideMapper_v2":
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params["database"],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == "UPeptideMapper_v3":
        peptide_mapper = upapa_class(input_params["database"])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == "UPeptideMapper_v4":
        peptide_mapper = upapa_class(input_params["database"])
        args = [list(peptides)]
    else:
        # Fail early with a clear message instead of an UnboundLocalError.
        sys.exit("Unknown class version: {0}".format(class_version))
    p2p_mappings = peptide_mapper.map_peptides(*args)
    print(
        "Buffering fasta and mapping {0} peptides took {1:1.2f} seconds".format(
            len(peptides), time.perf_counter() - map_start
        )
    )
    if len(p2p_mappings) == len(peptides):
        print("All peptides have been mapped!")
    else:
        print("WARNING: Not all peptides have been mapped")
if __name__ == "__main__":
if len(sys.argv) < 2:
print(main.__doc__)
sys.exit(1)
main(sys.argv[1])
Upeptide mapper v3 and 4 complete proteome match¶
-
complete_proteome_match.
main
(fasta_database, class_version)¶ Example script to demonstrate speed and memory efficiency of the new upeptide_mapper.
Specify fasta_database and class_version as input.
- usage:
- ./complete_proteome_match.py <fasta_database> <class_version>
- Class versions
- UPeptideMapper_v2
- UPeptideMapper_v3
- UPeptideMapper_v4
#!/usr/bin/env python3
# encoding: utf-8
import ursgal
import glob
import os.path
import sys
import time
def main(fasta_database, class_version):
    """
    Example script to demonstrate speed and memory efficiency of the new
    upeptide_mapper.

    Specify fasta_database and class_version as input.

    usage:
        ./complete_proteome_match.py <fasta_database> <class_version>

    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4
    """
    input_params = {
        # Bug fix: use the function parameter instead of reading sys.argv[1]
        # directly, so the function also works when called programmatically.
        "database": fasta_database,
    }
    uc = ursgal.UController(params=input_params)
    print("Parsing fasta and digesting sequences")
    peptides = set()
    # perf_counter is the monotonic clock intended for elapsed-time measurement.
    digest_start = time.perf_counter()
    # Context manager closes the fasta file handle reliably (was leaked before).
    with open(input_params["database"], "r") as fasta_fh:
        for fastaID, sequence in ursgal.ucore.parse_fasta(fasta_fh):
            # Missed cleavages are allowed here (default), unlike the
            # Chlamydomonas example which sets no_missed_cleavages=True.
            tryptic_peptides = ursgal.ucore.digest(
                sequence,
                ("KR", "C"),
            )
            for p in tryptic_peptides:
                # Keep only peptides in the search-relevant length range.
                if 6 <= len(p) <= 40:
                    peptides.add(p)
    print(
        "Parsing fasta and digesting sequences took {0:1.2f} seconds".format(
            time.perf_counter() - digest_start
        )
    )
    if sys.platform == "win32":
        # pyahocorasick (required by the v3/v4 mappers) cannot be pip-installed
        # on Windows, so force the pure-Python v2 implementation.
        print(
            "[ WARNING ] pyahocorasick can not be installed via pip on Windows at the moment\n"
            "[ WARNING ] Falling back to UPeptideMapper_v2"
        )
        class_version = "UPeptideMapper_v2"
    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"
    ].import_engine_as_python_function(class_version)
    print("Buffering fasta and mapping {0} peptides".format(len(peptides)))
    map_start = time.perf_counter()
    # The mapper versions differ in construction and map_peptides() signature.
    if class_version == "UPeptideMapper_v2":
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params["database"],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == "UPeptideMapper_v3":
        peptide_mapper = upapa_class(input_params["database"])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == "UPeptideMapper_v4":
        peptide_mapper = upapa_class(input_params["database"])
        args = [list(peptides)]
    else:
        # Fail early with a clear message instead of an UnboundLocalError.
        sys.exit("Unknown class version: {0}".format(class_version))
    p2p_mappings = peptide_mapper.map_peptides(*args)
    print(
        "Buffering fasta and mapping {0} peptides took {1:1.2f} seconds".format(
            len(peptides), time.perf_counter() - map_start
        )
    )
    if len(p2p_mappings) == len(peptides):
        print("All peptides have been mapped!")
    else:
        print("WARNING: Not all peptides have been mapped")
if __name__ == "__main__":
if len(sys.argv) < 3:
print(main.__doc__)
sys.exit(1)
main(sys.argv[1], sys.argv[2])