Example Scripts

simple example using combined fdr (or pep)

simple_combined_fdr_score.main()

Executes a search with 3 different search engines on an example file from the data from Barth et al. (The same file that is used in the XTandem version comparison example.)

usage:
./simple_combined_fdr_score.py

This is a simple example script to show how results from multiple search engines can be combined using the Combined FDR Score approach of Jones et al. (2009).

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os

def main():
    '''
    Executes a search with 3 different search engines on an example file from the
    data from Barth et al. (The same file that is used in the XTandem version
    comparison example.)

    usage:
        ./simple_combined_fdr_score.py


    This is a simple example script to show how results from multiple search engines
    can be combined using the Combined FDR Score approach of Jones et al. (2009).
    '''

    # Search engines whose results will be combined below
    engine_list = [
        'omssa_2_1_9',
        'xtandem_piledriver',
        # 'myrimatch_2_1_138',
        'msgfplus_v9979',
    ]

    params = {
        'database' : os.path.join(
            os.pardir,
            'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'modifications' : [ ],
        'csv_filter_rules':[
            ['PEP'      , 'lte'    , 0.01]    ,
            ['Is decoy' , 'equals' , 'false']
        ],
        # FTP credentials to fetch the example mzML file from PeptideAtlas
        'ftp_url'       : 'ftp.peptideatlas.org',
        'ftp_login'         : 'PASS00269',
        'ftp_password'      : 'FI4645a',
        'ftp_include_ext'   : [
            'JB_FASP_pH8_2-3_28122012.mzML',
        ],
        'ftp_output_folder' : os.path.join(
            os.pardir,
            'example_data',
            'xtandem_version_comparison'
        ),
        # HTTP source for the target-decoy database
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta' ,
        'http_output_folder' : os.path.join(
            os.pardir,
            'example_data'
        )
    }

    # idiomatic truthiness check instead of ``... is False``
    if not os.path.exists(params['ftp_output_folder']):
        os.mkdir(params['ftp_output_folder'])

    uc = ursgal.UController(
        profile = 'LTQ XL low res' ,
        params = params
    )
    mzML_file = os.path.join(
        params['ftp_output_folder'],
        params['ftp_include_ext'][0]
    )
    # Download the spectrum file and the database only if missing
    if not os.path.exists(mzML_file):
        uc.fetch_file(
            engine     = 'get_ftp_files_1_0_0'
        )
    if not os.path.exists(params['database']):
        uc.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    # Search + validate with each engine, collecting the validated CSVs
    validated_files_list = []
    for engine in engine_list:

        unified_result_file = uc.search(
            input_file = mzML_file,
            engine     = engine,
        )

        validated_file = uc.validate(
            input_file = unified_result_file,
            engine     = 'percolator_2_08',
        )

        validated_files_list.append( validated_file )

    combined_results = uc.combine_search_results(
        input_files     = validated_files_list,
        engine          = 'combine_FDR_0_1',
        # use combine_pep_1_0_0 for combined PEP :)
    )
    print('\n\tCombined results can be found here:')
    print(combined_results)
    return

if __name__ == '__main__':
    main()

do it all folder wide

do_it_all_folder_wide.main(folder=None, profile=None, target_decoy_database=None)

An example test script to search all mzML files which are present in the specified folder. The search is currently performed on 4 search engines and 2 validation engines.

The machine profile has to be specified as well as the target-decoy database.

usage:

./do_it_all_folder_wide.py <mzML_folder> <profile> <target_decoy_database>

Current profiles:

  • ‘QExactive+’
  • ‘LTQ XL low res’
  • ‘LTQ XL high res’
#!/usr/bin/env python3.4
# encoding: utf-8
import ursgal
import sys
import glob
import os

def main(folder=None, profile=None, target_decoy_database=None):
    '''
    An example test script to search all mzML files which are present in the
    specified folder. The search is currently performed on 4 search engines
    and 2 validation engines.

    The machine profile has to be specified as well as the target-decoy
    database.

    usage:

        ./do_it_all_folder_wide.py <mzML_folder> <profile> <target_decoy_database>


    Current profiles:

        * 'QExactive+'
        * 'LTQ XL low res'
        * 'LTQ XL high res'

    '''
    # Collect all mzML files from the given folder; glob.glob already
    # returns a list, so no manual append loop is needed
    mzML_files = glob.glob(os.path.join('{0}'.format(folder), '*.mzML'))

    mass_spectrometer = profile


    # We specify all search engines and validation engines that we want to use in a list
    # (version numbers might differ on windows or mac):
    search_engines  = [
        'omssa',
        'xtandem_vengeance',
        'msgfplus_v2016_09_16',
        # 'msamanda_1_0_0_6300',
        # 'myrimatch_2_1_138',
    ]

    validation_engines = [
        'percolator_2_08',
        'qvality',
    ]

    # Modifications that should be included in the search
    all_mods = [
        'C,fix,any,Carbamidomethyl',
        'M,opt,any,Oxidation',
        # 'N,opt,any,Deamidated',
        # 'Q,opt,any,Deamidated',
        # 'E,opt,any,Methyl',
        # 'K,opt,any,Methyl',
        # 'R,opt,any,Methyl',
        '*,opt,Prot-N-term,Acetyl',
        # 'S,opt,any,Phospho',
        # 'T,opt,any,Phospho',
        # 'N,opt,any,HexNAc'
    ]

    # Initializing the Ursgal UController class with
    # our specified modifications and mass spectrometer
    params = {
        'database'      : target_decoy_database,
        'modifications' : all_mods,
        'csv_filter_rules' : [
            ['Is decoy', 'equals', 'false'],
            ['PEP','lte', 0.01],
        ]
    }

    uc = ursgal.UController(
        profile = mass_spectrometer,
        params = params
    )

    # complete workflow:
    # every spectrum file is searched with every search engine,
    # results are validated (for each engine separately),
    # validated results are merged and filtered for targets and PEP <= 0.01.
    # In the end, all filtered results from all spectrum files are merged
    for validation_engine in validation_engines:
        result_files = []
        for spec_file in mzML_files:
            validated_results = []
            for search_engine in search_engines:
                unified_search_results = uc.search(
                    input_file = spec_file,
                    engine     = search_engine,
                )
                validated_csv = uc.validate(
                    input_file  = unified_search_results,
                    engine      = validation_engine,
                )
                validated_results.append( validated_csv )

            validated_results_from_all_engines = uc.merge_csvs(
                input_files = validated_results,
            )
            filtered_validated_results = uc.filter_csv(
                input_file = validated_results_from_all_engines,
            )
            result_files.append(filtered_validated_results)

        # only merge across spectrum files when there is more than one
        if len(result_files) > 1:
            results_all_files = uc.merge_csvs(
                input_files = result_files,
            )

if __name__ == '__main__':
    # Three positional arguments are required, so sys.argv must hold at
    # least 4 entries (argv[0] is the script name). The previous check of
    # ``< 3`` let a call with only two arguments raise IndexError on
    # sys.argv[3] instead of printing the usage text.
    if len(sys.argv) < 4:
        print(main.__doc__)
        exit()
    main(
        folder                = sys.argv[1],
        profile               = sys.argv[2],
        target_decoy_database = sys.argv[3],
    )

X!Tandem version comparison

xtandem_version_comparison.main()

Executes a search with 4 versions of X!Tandem on an example file from the data from Barth et al.

usage:
./xtandem_version_comparison.py

This is a simple example file to show the straightforward comparison of different program versions of X!Tandem

Creates a Venn diagram with the peptides obtained by the different versions.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os

def main():
    '''
    Executes a search with 4 versions of X!Tandem on an example file from the
    data from Barth et al.

    usage:
        ./xtandem_version_comparison.py


    This is a simple example file to show the straightforward comparison of
    different program versions of X!Tandem

    Creates a Venn diagram with the peptides obtained by the different versions.


    '''
    # X!Tandem releases to compare against each other
    engine_list = [
        'xtandem_cyclone',
        'xtandem_jackhammer',
        'xtandem_sledgehammer',
        'xtandem_piledriver',
        'xtandem_vengeance'

    ]

    params = {
        'database' : os.path.join(
            os.pardir,
            'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'modifications' : [ ],
        'csv_filter_rules':[
            ['PEP'      , 'lte'    , 0.01 ]    ,
            ['Is decoy' , 'equals' , 'false']
        ],
        # FTP credentials to fetch the example mzML file from PeptideAtlas
        'ftp_url'       : 'ftp.peptideatlas.org',
        'ftp_login'         : 'PASS00269',
        'ftp_password'      : 'FI4645a',
        'ftp_include_ext'   : [
            'JB_FASP_pH8_2-3_28122012.mzML',
        ],
        'ftp_output_folder' : os.path.join(
            os.pardir,
            'example_data',
            'xtandem_version_comparison'
        ),
        # HTTP source for the target-decoy database
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta' ,
        'http_output_folder' : os.path.join(
            os.pardir,
            'example_data'
        )
    }

    # idiomatic truthiness check instead of ``... is False``
    if not os.path.exists(params['ftp_output_folder']):
        os.mkdir(params['ftp_output_folder'])

    uc = ursgal.UController(
        profile = 'LTQ XL low res' ,
        params = params
    )
    mzML_file = os.path.join(
        params['ftp_output_folder'],
        params['ftp_include_ext'][0]
    )
    # Download the spectrum file and the database only if missing
    if not os.path.exists(mzML_file):
        uc.fetch_file(
            engine     = 'get_ftp_files_1_0_0'
        )
    if not os.path.exists(params['database']):
        uc.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    # Search, validate and filter with every X!Tandem version
    filtered_files_list = []
    for engine in engine_list:
        unified_result_file = uc.search(
            input_file = mzML_file,
            engine     = engine,
        )

        validated_file = uc.validate(
            input_file = unified_result_file,
            engine     = 'percolator_2_08',
        )

        filtered_file = uc.filter_csv(
            input_file = validated_file,
        )

        filtered_files_list.append( filtered_file )

    # Venn diagram of the peptides found by each version
    uc.visualize(
        input_files     = filtered_files_list,
        engine          = 'venndiagram',
    )
    return

if __name__ == '__main__':
    main()

BSA machine ppm offset example

bsa_ppm_offset_test.main()

Example script to do a simple machine ppm offset parameter sweep. The m/z values in the example mgf file are stepwise changed, and in the final output the total peptides are counted.

usage:
./bsa_ppm_offset_test.py

As expected, if the offset becomes too big, no peptides can be found anymore.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil


def main():
    '''
    Example script to do a simple machine ppm offset parameter sweep.
    The m/z values in the example mgf file are stepwise changed and in the
    final output the total peptides are counted.

    usage:
        ./bsa_ppm_offset_test.py

    As expected, if the offset becomes too big no peptides can be found anymore.

    '''
    # (offset value, file name prefix); None means "no offset applied"
    ppm_offsets = [
        (-10e-6 , '-10_ppm_offset') ,
        (-9e-6  , '-9_ppm_offset')  ,
        (-8e-6  , '-8_ppm_offset')  ,
        (-7e-6  , '-7_ppm_offset')  ,
        (-6e-6  , '-6_ppm_offset')  ,
        (-5e-6  , '-5_ppm_offset')  ,
        (-4e-6  , '-4_ppm_offset')  ,
        (-3e-6  , '-3_ppm_offset')  ,
        (-2e-6  , '-2_ppm_offset')  ,
        (-1e-6  , '-1_ppm_offset')  ,
        (None   , '0_ppm_offset')   ,
        (1e-6   , '1_ppm_offset')   ,
        (2e-6   , '2_ppm_offset')   ,
        (3e-6   , '3_ppm_offset')   ,
        (4e-6   , '4_ppm_offset')   ,
        (5e-6   , '5_ppm_offset')   ,
        (6e-6   , '6_ppm_offset')   ,
        (7e-6   , '7_ppm_offset')   ,
        (8e-6   , '8_ppm_offset')   ,
        (9e-6   , '9_ppm_offset')   ,
        (10e-6  , '10_ppm_offset')  ,
    ]

    engine_list = [
        'xtandem_vengeance'
    ]

    R = ursgal.UController(
        profile = 'LTQ XL low res',
        params = {
            'database' : os.path.join(
                os.pardir,
                'example_data',
                'BSA.fasta'
            ),
            'modifications' : [
                'M,opt,any,Oxidation',        # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'    # N-Acteylation
            ],
        }
    )

    mzML_file = os.path.join(
        os.pardir,
        'example_data',
        'BSA_machine_ppm_offset_example',
        'BSA1.mzML'
    )
    if not os.path.exists(mzML_file):
        R.params['http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )
        # The downloaded file may keep the query string in its name;
        # rename it to the plain mzML name. Catch only OSError (which
        # covers FileNotFoundError and shutil.Error) instead of a bare
        # ``except:`` that would also swallow e.g. KeyboardInterrupt.
        try:
            shutil.move(
                '{0}?format=raw'.format(mzML_file),
                mzML_file
            )
        except OSError:
            shutil.move(
                '{0}format=raw'.format(mzML_file),
                mzML_file
            )

    # Run one search per engine and offset; the prefix keeps the output
    # files of each offset separate
    for engine in engine_list:
        for ( ppm_offset, prefix) in ppm_offsets:

            R.params['machine_offset_in_ppm'] = ppm_offset
            R.params['prefix'] = prefix

            unified_search_result_file = R.search(
                input_file = mzML_file,
                engine     = engine,
                force      = False,
            )

    # Count the distinct peptide sequences per unified result file
    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(os.path.dirname(mzML_file))):
        # context manager closes the file handle deterministically
        # (the original left it to the garbage collector)
        with open(csv_path, 'r') as csv_in:
            for line_dict in csv.DictReader(csv_in):
                collector[ csv_path ].add( line_dict['Sequence'] )
    for csv_path, peptide_set in sorted(collector.items()):
        print(
            'File {0} has {1} peptides'.format(
                os.path.basename(csv_path),
                len(peptide_set)
            )
        )

    return

if __name__ == '__main__':
    main()

Bruderer high resolution machine ppm offset example

qexactive_xtandem_version_comparison.main(folder)

Executes a search with 4 versions of X!Tandem on an example file from the data from Bruderer et al.

usage:
./xtandem_version_comparison.py

This is a simple example file to show the straightforward comparison of different program versions of X!Tandem when analyzing high resolution data, which can be better handled by versions newer than Jackhammer. One gains approximately 10 percent more peptides with newer versions of X!Tandem.

Creates a Venn diagram with the peptides obtained by the different versions.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os
import sys


def main(folder):
    '''
    Executes a search with 4 versions of X!Tandem on an example file from the
    data from Bruderer et al.

    usage:
        ./xtandem_version_comparison.py


    This is a simple example file to show the straightforward comparison of
    different program versions of X!Tandem when analyzing high resolution data
    which can be better handled by versions newer than Jackhammer. One gains
    approximately 10 percent more peptides with newer versions of X!Tandem.

    Creates a Venn diagram with the peptides obtained by the different versions.


    '''

    required_example_file = 'B_D140314_SGSDSsample1_R01_MSG_T0.mzML'

    # If the example mzML is not present yet, download the RAW data and
    # ask the user to convert it before re-running the script
    if not os.path.exists(os.path.join(folder, required_example_file)):
        print(
            '''
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn)
            will be downloaded.
            Please convert to mzML after the download has finished and run this
            script again.
            '''.format(required_example_file)
        )

        ftp_get_params = {
            'ftp_url'                : 'ftp.peptideatlas.org',
            'ftp_login'              : 'PASS00589',
            'ftp_password'           : 'WF6554orn',
            'ftp_include_ext'        : [
                required_example_file.replace(
                    '.mzML',
                    '.raw'
                )
            ],
            'ftp_output_folder' : folder,
        }
        uc = ursgal.UController(
            params = ftp_get_params
        )
        uc.fetch_file(
            engine     = 'get_ftp_files_1_0_0'
        )
        exit()


    # X!Tandem releases to compare against each other
    engine_list = [
        'xtandem_cyclone',
        'xtandem_jackhammer',
        'xtandem_sledgehammer',
        'xtandem_piledriver',
    ]

    params = {
        'database' : os.path.join(
            os.pardir,
            'example_data',
            'hs_201303_qs_sip_target_decoy.fasta'
        ),
        'modifications' : [ 'C,fix,any,Carbamidomethyl' ],
        'csv_filter_rules':[
            ['PEP'      , 'lte'    , 0.01]    ,
            ['Is decoy' , 'equals' , 'false']
        ],
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta' ,
        'http_output_folder' : os.path.join(
            os.pardir,
            'example_data'
        ),
        'machine_offset_in_ppm' : -5e-6,
    }

    uc = ursgal.UController(
        profile = 'QExactive+' ,
        params = params
    )

    # Fetch the target-decoy database only if it is missing
    if not os.path.exists(params['database']):
        uc.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    mzML_file = os.path.join(folder, required_example_file)

    # Search, validate and filter with every X!Tandem version
    filtered_files_list = []
    for engine in engine_list:

        unified_result_file = uc.search(
            input_file = mzML_file,
            engine     = engine,
            force      = False,
        )

        validated_file = uc.validate(
            input_file = unified_result_file,
            engine     = 'percolator_2_08',
        )

        filtered_file = uc.filter_csv(
            input_file = validated_file,
        )

        filtered_files_list.append( filtered_file )

    # Venn diagram of the peptides found by each version
    uc.visualize(
        input_files     = filtered_files_list,
        engine          = 'venndiagram',
    )
    return

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(main.__doc__)
        exit()
    main(sys.argv[1])

target decoy generation

target_decoy_generation_example.main()

Simple example script how to generate a target decoy database.

usage:

./target_decoy_generation_example.py
#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os


def main():
    '''
    Simple example script demonstrating how a target decoy database is
    generated from a FASTA file.

    usage:

        ./target_decoy_generation_example.py

    '''
    # Enzyme and decoy strategy used for the generation
    td_params = {
        'enzyme'                : 'trypsin',
        'decoy_generation_mode' : 'reverse_protein',
    }

    # Input FASTA database(s) to build the target-decoy database from
    input_fasta_files = [
        os.path.join(
            os.pardir,
            'example_data',
            'BSA.fasta'
        )
    ]

    controller = ursgal.UController(params=td_params)

    target_decoy_db = controller.generate_target_decoy(
        input_files      = input_fasta_files,
        output_file_name = 'my_BSA_target_decoy.fasta',
    )
    print('Generated target decoy database: {0}'.format(target_decoy_db))

if __name__ == '__main__':
    main()

precursor mass tolerance example

bsa_precursor_mass_tolerance_example.main()

Example script to do a precursor mass tolerance parameter sweep.

usage:
./bsa_precursor_mass_tolerance_example.py
#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil

def main():
    '''
    Example script to do a precursor mass tolerance parameter sweep.

    usage:
        ./bsa_precursor_mass_tolerance_example.py


    '''
    # Precursor tolerances (in ppm) to sweep over
    precursor_mass_tolerance_list = [
        1 ,
        2 ,
        3 ,
        4 ,
        5 ,
        6 ,
        7 ,
        8 ,
        9 ,
        10 ,
    ]

    engine_list = [
        'xtandem_vengeance'
    ]

    R = ursgal.UController(
        profile = 'LTQ XL low res',
        params = {
            'database' : os.path.join(
                os.pardir,
                'example_data',
                'BSA.fasta'
            ),
            'modifications' : [
                'M,opt,any,Oxidation',        # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'    # N-Acteylation
            ],
        }
    )

    mzML_file = os.path.join(
        os.pardir,
        'example_data',
        'BSA_precursor_mass_tolerance_example',
        'BSA1.mzML'
    )
    if not os.path.exists(mzML_file):

        R.params['http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'

        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )
        # The downloaded file may keep the query string in its name;
        # rename it to the plain mzML name. Catch only OSError (covers
        # FileNotFoundError and shutil.Error) instead of a bare
        # ``except:`` that would also swallow e.g. KeyboardInterrupt.
        try:
            shutil.move(
                '{0}?format=raw'.format(mzML_file),
                mzML_file
            )
        except OSError:
            shutil.move(
                '{0}format=raw'.format(mzML_file),
                mzML_file
            )
    # Convert mzML to MGF outside the loop, so this step is not repeated in the loop
    mgf_file = R.convert_to_mgf_and_update_rt_lookup(
        input_file = mzML_file
    )

    # One search per engine and precursor tolerance; the prefix keeps the
    # output files of each tolerance separate
    for engine in engine_list:
        for precursor_mass_tolerance in precursor_mass_tolerance_list:

            R.params['precursor_mass_tolerance_minus'] = precursor_mass_tolerance
            R.params['precursor_mass_tolerance_plus'] = precursor_mass_tolerance

            R.params['prefix'] = '{0}_precursor_mass_tolerance_'.format(
                precursor_mass_tolerance
            )

            unified_search_result_file = R.search(
                input_file = mgf_file,
                engine     = engine,
                force      = False,
            )

    # Count the distinct peptide sequences per unified result file
    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(os.path.dirname(mzML_file))):
        # context manager closes the file handle deterministically
        # (the original left it to the garbage collector)
        with open(csv_path, 'r') as csv_in:
            for line_dict in csv.DictReader(csv_in):
                collector[ csv_path ].add( line_dict['Sequence'] )
    for csv_path, peptide_set in sorted(collector.items()):
        print(
            'File {0} has {1} peptides'.format(
                os.path.basename(csv_path),
                len(peptide_set)
            )
        )

    return

if __name__ == '__main__':
    main()

fragment mass tolerance example

bsa_fragment_mass_tolerance_example.main()

Example script to do a fragment mass tolerance parameter sweep.

usage:
./bsa_fragment_mass_tolerance_example.py

If the fragment mass tolerance becomes too small, very few peptides are found. With this small sweep the actual minimum accuracy of a mass spectrometer can be estimated.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import glob
import csv
from collections import defaultdict as ddict
import os
import shutil


def main():
    '''
    Example script to do a fragment mass tolerance parameter sweep.

    usage:
        ./bsa_fragment_mass_tolerance_example.py

    If the fragment mass tolerance becomes too small, very few peptides are
    found. With this small sweep the actual min accuracy of a mass
    spectrometer can be estimated.

    '''
    # Fragment tolerances (in Da) to sweep over
    fragment_mass_tolerance_list = [
        0.02,
        0.04,
        0.06,
        0.08,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
    ]

    engine_list = [
        'xtandem_vengeance'
    ]

    R = ursgal.UController(
        profile = 'LTQ XL low res',
        params = {
            'database' : os.path.join(
                os.pardir,
                'example_data',
                'BSA.fasta'
            ),
            'modifications' : [
                'M,opt,any,Oxidation',        # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'    # N-Acteylation
            ],
        }
    )

    mzML_file = os.path.join(
        os.pardir,
        'example_data',
        'BSA_fragment_mass_tolerance_example',
        'BSA1.mzML'
    )
    if not os.path.exists(mzML_file):
        R.params['http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )
        # The downloaded file may keep the query string in its name;
        # rename it to the plain mzML name. Catch only OSError (covers
        # FileNotFoundError and shutil.Error) instead of a bare
        # ``except:`` that would also swallow e.g. KeyboardInterrupt.
        try:
            shutil.move(
                '{0}?format=raw'.format(mzML_file),
                mzML_file
            )
        except OSError:
            shutil.move(
                '{0}format=raw'.format(mzML_file),
                mzML_file
            )

    # Convert mzML to MGF outside the loop, so this step is not repeated in the loop
    mgf_file = R.convert_to_mgf_and_update_rt_lookup(
        input_file = mzML_file
    )

    # One search per engine and fragment tolerance; the prefix keeps the
    # output files of each tolerance separate
    for engine in engine_list:
        for fragment_mass_tolerance in fragment_mass_tolerance_list:

            R.params[ 'frag_mass_tolerance' ] = fragment_mass_tolerance

            R.params['prefix'] = '{0}_fragment_mass_tolerance_'.format(
                fragment_mass_tolerance
            )

            unified_search_result_file = R.search(
                input_file = mgf_file,
                engine     = engine,
                force      = False,
            )

    # Count the distinct peptide sequences per unified result file
    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(os.path.dirname(mzML_file))):
        # context manager closes the file handle deterministically
        # (the original left it to the garbage collector)
        with open(csv_path, 'r') as csv_in:
            for line_dict in csv.DictReader(csv_in):
                collector[ csv_path ].add( line_dict['Sequence'] )
    for csv_path, peptide_set in sorted(collector.items()):
        print(
            'File {0} has {1} peptides'.format(
                os.path.basename(csv_path),
                len(peptide_set)
            )
        )
    return

if __name__ == '__main__':
    main()

filter csv examples

filter for modifications

filter_csv_for_mods_example.main()

Examples script for filtering unified results for modification containing entries

usage:
./filter_csv_for_mods_example.py

Will produce a file with only entries which contain Carbamidomethyl as a modification.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os


def main():
    '''
    Example script that filters unified results down to the entries that
    carry a modification.

    usage:
        ./filter_csv_for_mods_example.py


    The output file contains only rows whose modification column includes
    Carbamidomethyl.
    '''
    # Keep only modified entries; suppress writing the unfiltered copy
    filter_params = {
        'csv_filter_rules': [
            ['Modifications', 'contains', 'Carbamidomethyl'],
        ],
        'write_unfiltered_results': False

    }

    input_csv = os.path.join(
        os.pardir,
        'example_data',
        'misc',
        'filter_csv_for_mods_example_omssa_2_1_9_unified.csv'
    )

    controller = ursgal.UController(params=filter_params)
    controller.filter_csv(input_file=input_csv)


if __name__ == '__main__':
    main()

filter validated results

filter_csv_validation_example.main()

Examples script for filtering validated results for a PEP <= 0.01 and remove all decoys.

usage:
./filter_csv_validation_example.py

Will produce a file with only target sequences with a posterior error probability of lower or equal to 1 percent

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os


def main():
    '''
    Example script that filters validated results, keeping only target
    (non-decoy) hits with a posterior error probability (PEP) <= 0.01.

    usage:
        ./filter_csv_validation_example.py


    Produces a file containing only target sequences whose posterior error
    probability is at most 1 percent.
    '''
    # PEP threshold and decoy removal rules
    validation_filter = {
        'csv_filter_rules': [
            ['PEP', 'lte', 0.01],
            ['Is decoy', 'equals', 'false']
        ]
    }

    input_csv = os.path.join(
        os.pardir,
        'example_data',
        'misc',
        'filter_csv_validation_example_omssa_2_1_9_unified_percolator_2_08_validated.csv'
    )

    controller = ursgal.UController(params=validation_filter)
    controller.filter_csv(input_file=input_csv)


if __name__ == '__main__':
    main()

Machine ppm offset sweep

Reproduce data for figure 2

machine_offset_bruderer_sweep.search(input_folder=None)

Does the parameter sweep on every tenth MS2 spectrum of the data from Bruderer et al. (2015) and X!Tandem Sledgehammer.

Note

Please download the .RAW data for the DDA dataset from peptideatlas.org (PASS00589, password: WF6554orn) and convert to mzML. Then the script can be executed with the folder with the mzML files as the first argument.

Warning

This script (if the sweep ranges are not changed) will perform 10080 searches which will produce approximately 100 GB output (inclusive mzML files)

usage:

./machine_offset_bruderer_sweep.py <folder_with_bruderer_data>

Sweeps over:

  • machine_offset_in_ppm from -20 to +20 ppm offset
  • precursor mass tolerance from 1 to 5 ppm
  • fragment mass tolerance from -2.5 to 20 ppm

The search can be very time consuming (depending on your machine/cluster); therefore the analyze step can be performed separately by calling analyze() instead of search() when one has already performed the searches and wants to analyze the results.

machine_offset_bruderer_sweep.analyze(folder)

Parses the result files from search() and writes a result .csv file which contains the data to plot figure 2.

#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import glob
import csv
import os
from collections import defaultdict as ddict
import sys
import re


# Reference machine offsets (in ppm) for each mzML file of the Bruderer
# et al. data set, used by search()/analyze() below.
# NOTE(review): the "MQ" prefix suggests these offsets were determined
# with MaxQuant — confirm against the accompanying publication.
MQ_OFFSET_TO_FILENAME = [
    ( 4.71  , 'B_D140314_SGSDSsample2_R01_MSG_T0.mzML' ),
    ( 4.98  , 'B_D140314_SGSDSsample6_R01_MSG_T0.mzML' ),
    ( 4.9   , 'B_D140314_SGSDSsample1_R01_MSG_T0.mzML' ),
    ( 5.41  , 'B_D140314_SGSDSsample4_R01_MSG_T0.mzML' ),
    ( 5.78  , 'B_D140314_SGSDSsample5_R01_MSG_T0.mzML' ),
    ( 6.01  , 'B_D140314_SGSDSsample8_R01_MSG_T0.mzML' ),
    ( 6.22  , 'B_D140314_SGSDSsample7_R01_MSG_T0.mzML' ),
    ( 6.83  , 'B_D140314_SGSDSsample3_R01_MSG_T0.mzML' ),
    ( 7.61  , 'B_D140314_SGSDSsample4_R02_MSG_T0.mzML' ),
    ( 7.59  , 'B_D140314_SGSDSsample8_R02_MSG_T0.mzML' ),
    ( 7.93  , 'B_D140314_SGSDSsample6_R02_MSG_T0.mzML' ),
    ( 7.91  , 'B_D140314_SGSDSsample1_R02_MSG_T0.mzML' ),
    ( 8.23  , 'B_D140314_SGSDSsample3_R02_MSG_T0.mzML' ),
    ( 8.33  , 'B_D140314_SGSDSsample7_R02_MSG_T0.mzML' ),
    ( 9.2   , 'B_D140314_SGSDSsample5_R02_MSG_T0.mzML' ),
    ( 9.4   , 'B_D140314_SGSDSsample2_R02_MSG_T0.mzML' ),
    ( 9.79  , 'B_D140314_SGSDSsample1_R03_MSG_T0.mzML' ),
    ( 10.01 , 'B_D140314_SGSDSsample3_R03_MSG_T0.mzML' ),
    ( 10.03 , 'B_D140314_SGSDSsample7_R03_MSG_T0.mzML' ),
    ( 10.58 , 'B_D140314_SGSDSsample2_R03_MSG_T0.mzML' ),
    ( 11.1  , 'B_D140314_SGSDSsample4_R03_MSG_T0.mzML' ),
    ( 11.21 , 'B_D140314_SGSDSsample5_R03_MSG_T0.mzML' ),
    ( 11.45 , 'B_D140314_SGSDSsample6_R03_MSG_T0.mzML' ),
    ( 12.19 , 'B_D140314_SGSDSsample8_R03_MSG_T0.mzML' ),
]

# Shared UController parameters for the sweep runs below.
GENERAL_PARAMS = {
    # Target-decoy FASTA database (downloaded on demand via 'http_url')
    'database' : os.path.join(
        os.pardir,
        'example_data',
        'hs_201303_qs_sip_target_decoy.fasta'
    ),
    'modifications' : [
        'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
    ],
    # Only every 10th MS2 spectrum is searched, keeping the sweep tractable
    'scan_skip_modulo_step' : 10,
    'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta' ,
    'http_output_folder' : os.path.join(
        os.pardir,
        'example_data'
    )
}

def search(input_folder=None):
    '''
    Does the parameter sweep on every tenth MS2 spectrum of the data from
    Bruderer et al. (2015) and X!Tandem Sledgehammer.

    Note:

        Please download the .RAW data for the DDA dataset from peptideatlas.org
        (PASS00589, password: WF6554orn) and convert to mzML.
        Then the script can be executed with the folder with the mzML files as
        the first argument.

    Warning:

        This script (if the sweep ranges are not changed) will perform 10080
        searches which will produce approximately 100 GB output (inclusive mzML
        files)

    usage:

        ./machine_offset_bruderer_sweep.py <folder_with_bruderer_data>

    Sweeps over:

        * machine_offset_in_ppm from -20 to +20 ppm offset
        * precursor mass tolerance from 1 to 5 ppm
        * fragment mass tolerance from 2.5 to 20

    The search can be very time consuming (depending on your machine/cluster),
    therefore the analyze step can be performed separately by calling analyze()
    instead of search() when one has already performed the searches and wants
    to analyze the results.
    '''

    # Map every known mzML file to the list of machine offsets to test:
    # -20 to +18 ppm in steps of 2, expressed as fractions (ppm / 1e6).
    file_2_target_offset = {}
    for MQ_offset, file_name in MQ_OFFSET_TO_FILENAME:
        file_2_target_offset[ file_name ] = { 'offsets' : [] }
        for n in range(-20, 20, 2):
            file_2_target_offset[ file_name ][ 'offsets' ].append( n / 1e6 )

    engine_list                  = ['xtandem_sledgehammer']

    frag_ion_tolerance_list      = [ 2.5, 5, 10, 20 ]

    precursor_ion_tolerance_list = [ 1, 2, 3, 4, 5 ]

    R = ursgal.UController(
        profile = 'QExactive+',
        params = GENERAL_PARAMS
    )

    # Fetch the target-decoy database on first run:
    if os.path.exists(R.params['database']) is False:
        R.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    # The prefix encodes the tested offset, precursor ion tolerance (pit) and
    # fragment ion tolerance (fit) in the result file names, e.g.
    # 'ppm_offset_-20_pit_1_fit_2.5', so that analyze() can recover the swept
    # parameters from the file names again.
    # BUGFIX: this used to be '_pit_{1}_fit_{2}' formatted with only two
    # arguments, which raised an IndexError and also dropped the ppm offset
    # that analyze() expects at splitted_basename[2].
    prefix_format_string = 'ppm_offset_{0}_pit_{1}_fit_{2}'
    for mzML_path in glob.glob( os.path.join( input_folder, '*.mzML' ) ):

        mzML_basename = os.path.basename( mzML_path )
        if mzML_basename not in file_2_target_offset.keys():
            continue
        for ppm_offset in file_2_target_offset[ mzML_basename ]['offsets']:
            R.params['machine_offset_in_ppm'] = ppm_offset
            R.params['prefix']                = 'ppm_offset_{0}'.format(
                int(ppm_offset * 1e6)
            )
            # Conversion is re-run per offset since the offset shifts m/z:
            mgf_file = R.convert_to_mgf_and_update_rt_lookup(
                input_file = mzML_path
            )
            for engine in engine_list:
                for precursor_ion_tolerance in precursor_ion_tolerance_list:
                    for frag_ion_tolerance in frag_ion_tolerance_list:

                        new_prefix = prefix_format_string.format(
                            int(ppm_offset * 1e6),
                            precursor_ion_tolerance,
                            frag_ion_tolerance
                        )
                        R.params['precursor_mass_tolerance_minus'] = precursor_ion_tolerance
                        R.params['precursor_mass_tolerance_plus']  = precursor_ion_tolerance
                        R.params['frag_mass_tolerance']            = frag_ion_tolerance
                        R.params['prefix']                         = new_prefix

                        unified_search_result_file = R.search(
                            input_file = mzML_path,
                            engine     = engine,
                            force      = False,
                        )

    return

def analyze(folder):
    '''
    Parses the result files from search() and writes one result .csv file per
    precursor/fragment tolerance combination, containing the data to plot
    figure 2.
    '''

    R = ursgal.UController(
        profile = 'QExactive+',
        params = GENERAL_PARAMS
    )
    csv_collector = {}
    ve = 'qvality_2_02'

    # Raw string: '\d' in a plain literal is a deprecated escape sequence.
    sample_regex_pattern = r'sample\d_R0\d'

    sample_2_x_pos_and_mq_offset = {}

    sample_offset_combos = []

    # All offsets search() may have tested, as strings, since they are
    # compared against values parsed from file names:
    all_tested_offsets = [ str(n) for n in range(-20, 21, 2) ]

    for pos, (mq_ppm_off, mzML_file) in enumerate(MQ_OFFSET_TO_FILENAME):
        _sample = re.search(sample_regex_pattern, mzML_file).group()
        sample_2_x_pos_and_mq_offset[ _sample ] = (pos, mq_ppm_off)
        for theo_offset in all_tested_offsets:
            sample_offset_combos.append( ( _sample, theo_offset ) )

    for csv_path in glob.glob(os.path.join('{0}'.format(folder), '*', '*_unified.csv')):
        # File names look like 'ppm_offset_<off>_pit_<pit>_fit_<fit>_...'
        sample                  = re.search(sample_regex_pattern, csv_path).group()
        splitted_basename       = os.path.basename(csv_path).split('_')
        offset                  = splitted_basename[2]
        precursor_ion_tolerance = splitted_basename[4]
        frag_ion_tolerance      = splitted_basename[6]
        prefix                  = '_'.join(splitted_basename[:7])

        R.params['machine_offset_in_ppm']          = offset
        R.params['precursor_mass_tolerance_minus'] = precursor_ion_tolerance
        R.params['precursor_mass_tolerance_plus']  = precursor_ion_tolerance
        R.params['frag_mass_tolerance']            = frag_ion_tolerance
        R.params['prefix']                         = prefix

        validated_path = csv_path.replace(
            '_unified.csv',
            '_{0}_validated.csv'.format(ve)
        )
        if os.path.exists(validated_path):
            csv_path = validated_path
        else:
            try:
                csv_path = R.validate(
                    input_file = csv_path,
                    engine     = ve
                )
            # Narrowed from a bare 'except:' (which would also swallow
            # KeyboardInterrupt/SystemExit); files failing validation are
            # deliberately skipped, best-effort.
            except Exception:
                continue

        pit_fit = (precursor_ion_tolerance, frag_ion_tolerance)

        if pit_fit not in csv_collector.keys():
            csv_collector[ pit_fit ] = ddict(set)

        csv_key = (sample, offset)

        print('Reading file: {0}'.format(csv_path))
        # 'with' ensures the handle is closed (was a leaked open() call).
        # Count unique peptidoforms (sequence + modifications) among
        # non-decoy PSMs with PEP <= 0.01:
        with open(csv_path, 'r') as csv_io:
            for line_dict in csv.DictReader(csv_io):
                if line_dict['Is decoy'] == 'true':
                    continue
                if float(line_dict['PEP']) <= 0.01:
                    csv_collector[ pit_fit ][ csv_key ].add(
                        '{0}{1}'.format(
                            line_dict['Sequence'],
                            line_dict['Modifications']
                        )
                    )

    fieldnames = [
        'Sample',
        'pos',
        'MQ_offset',
        'tested_ppm_offset',
        'peptide_count'
    ]

    outfile_name_format_string = 'bruderer_data_ppm_sweep_precursor_mass_tolerance_{0}_fragment_mass_tolerance_{1}.csv'

    for pit_fit in csv_collector.keys():
        with open(outfile_name_format_string.format(*pit_fit), 'w') as io:
            csv_writer = csv.DictWriter(io, fieldnames)
            csv_writer.writeheader()

            # write missing values (sample/offset combos without results)
            for sample_offset in sample_offset_combos:
                sample, ppm_offset = sample_offset
                if sample_offset not in csv_collector[pit_fit].keys():
                    dict_2_write = {
                        'Sample'            : sample,
                        'pos'               : sample_2_x_pos_and_mq_offset[sample][0],
                        'MQ_offset'         : '',
                        'tested_ppm_offset' : ppm_offset,
                        'peptide_count'     : 0,
                    }
                    csv_writer.writerow(dict_2_write)

            for (sample, ppm_offset), peptide_set in csv_collector[pit_fit].items():
                dict_2_write = {
                    'Sample'            : sample,
                    'pos'               : sample_2_x_pos_and_mq_offset[sample][0],
                    # MaxQuant reports the offset with opposite sign:
                    'MQ_offset'         : sample_2_x_pos_and_mq_offset[sample][1] * -1,
                    'tested_ppm_offset' : ppm_offset,
                    'peptide_count'     : len(peptide_set),
                }
                csv_writer.writerow(dict_2_write)
    return

if __name__ == '__main__':
    # The folder with the Bruderer mzML files is a required CLI argument;
    # without it, show the usage information from the docstring.
    if len(sys.argv) >= 2:
        search(sys.argv[1])
        analyze(sys.argv[1])
    else:
        print(search.__doc__)
        exit()

Large scale data analysis

barth_et_al_large_scale.main(folder)

Example script for reproducing the data for figure 3

usage:

./barth_et_al_large_scale.py <folder>

The folder determines the target folder where the files will be downloaded

Chlamydomonas reinhardtii samples

Three biological replicates of 4 conditions (2_3, 2_4, 3_1, 4_1)

For more details on the samples please refer to Barth, J.; Bergner, S. V.; Jaeger, D.; Niehues, A.; Schulze, S.; Scholz, M.; Fufezan, C. The interplay of light and oxygen in the reactive oxygen stress response of Chlamydomonas reinhardtii dissected by quantitative mass spectrometry. MCP 2014, 13 (4), 969–989.

Merge all search results (per biological replicate and condition, on folder level) on engine level and validate via percolator.

‘LTQ XL high res’:

  • repetition 1
  • repetition 2

‘LTQ XL low res’:

  • repetition 3

Database:

  • Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta

Note

The database and the files will be automatically downloaded from our webpage and peptideatlas

#!/usr/bin/env python3.4
# encoding: utf-8


import ursgal
import glob
import os.path
import sys


def main(folder):
    '''

    Example script for reproducing the data for figure 3

    usage:

        ./barth_et_al_large_scale.py <folder>

    The folder determines the target folder where the files will be downloaded

    Chlamydomonas reinhardtii samples

    Three biological replicates of 4 conditions (2_3, 2_4, 3_1, 4_1)

    For more details on the samples please refer to
    Barth, J.; Bergner, S. V.; Jaeger, D.; Niehues, A.; Schulze, S.; Scholz,
    M.; Fufezan, C. The interplay of light and oxygen in the reactive oxygen
    stress response of Chlamydomonas reinhardtii dissected by quantitative mass
    spectrometry. MCP 2014, 13 (4), 969–989.

    Merge all search results (per biological replicate and condition, on folder
    level) on engine level and validate via percolator.

    'LTQ XL high res':

        * repetition 1
        * repetition 2

    'LTQ XL low res':

        * repetition 3

    Database:

        * Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta

    Note:

        The database and the files will be automatically downloaded from our
        webpage and peptideatlas
    '''

    input_params = {
        'database' : os.path.join(
            os.pardir,
            'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'modifications' : [
            'M,opt,any,Oxidation',
            '*,opt,Prot-N-term,Acetyl',  # N-Acetylation
        ],
        'ftp_url'       : 'ftp.peptideatlas.org',

        'ftp_login'         : 'PASS00269',
        'ftp_password'      : 'FI4645a',

        'ftp_output_folder_root' : folder,
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta' ,
        'http_output_folder' : os.path.join(
            os.pardir,
            'example_data'
        )
    }

    uc = ursgal.UController(
        params = input_params
    )

    # Fetch the target-decoy database once, if it is not present yet:
    if os.path.exists(input_params['database']) is False:
        uc.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    # Maps (output subfolder, mass spectrometer profile) to the mzML files
    # (6 pH fractions each) that belong to that replicate/condition:
    output_folder_to_file_list ={
        ('rep1_sample_2_3','LTQ XL high res') : [
            'CF_07062012_pH8_2_3A.mzML',
            'CF_13062012_pH3_2_3A.mzML',
            'CF_13062012_pH4_2_3A.mzML',
            'CF_13062012_pH5_2_3A.mzML',
            'CF_13062012_pH6_2_3A.mzML',
            'CF_13062012_pH11FT_2_3A.mzML',
        ],
        ('rep1_sample_2_4','LTQ XL high res') : [
            'CF_07062012_pH8_2_4A.mzML',
            'CF_13062012_pH3_2_4A_120615113039.mzML',
            'CF_13062012_pH4_2_4A.mzML',
            'CF_13062012_pH5_2_4A.mzML',
            'CF_13062012_pH6_2_4A.mzML',
            'CF_13062012_pH11FT_2_4A.mzML',

        ],
        ('rep1_sample_3_1','LTQ XL high res') : [
            'CF_12062012_pH8_1_3A.mzML',
            'CF_13062012_pH3_1_3A.mzML',
            'CF_13062012_pH4_1_3A.mzML',
            'CF_13062012_pH5_1_3A.mzML',
            'CF_13062012_pH6_1_3A.mzML',
            'CF_13062012_pH11FT_1_3A.mzML',
        ],
        ('rep1_sample_4_1','LTQ XL high res') : [
            'CF_07062012_pH8_1_4A.mzML',
            'CF_13062012_pH3_1_4A.mzML',
            'CF_13062012_pH4_1_4A.mzML',
            'CF_13062012_pH5_1_4A.mzML',
            'CF_13062012_pH6_1_4A.mzML',
            'CF_13062012_pH11FT_1_4A.mzML',
        ],

        ('rep2_sample_2_3','LTQ XL high res') : [
            'JB_18072012_2-3_A_FT.mzML',
            'JB_18072012_2-3_A_pH3.mzML',
            'JB_18072012_2-3_A_pH4.mzML',
            'JB_18072012_2-3_A_pH5.mzML',
            'JB_18072012_2-3_A_pH6.mzML',
            'JB_18072012_2-3_A_pH8.mzML',
        ],
        ('rep2_sample_2_4','LTQ XL high res') : [

            'JB_18072012_2-4_A_FT.mzML',
            'JB_18072012_2-4_A_pH3.mzML',
            'JB_18072012_2-4_A_pH4.mzML',
            'JB_18072012_2-4_A_pH5.mzML',
            'JB_18072012_2-4_A_pH6.mzML',
            'JB_18072012_2-4_A_pH8.mzML',

        ],
        ('rep2_sample_3_1','LTQ XL high res') : [
            'JB_18072012_3-1_A_FT.mzML',
            'JB_18072012_3-1_A_pH3.mzML',
            'JB_18072012_3-1_A_pH4.mzML',
            'JB_18072012_3-1_A_pH5.mzML',
            'JB_18072012_3-1_A_pH6.mzML',
            'JB_18072012_3-1_A_pH8.mzML',
        ],
        ('rep2_sample_4_1','LTQ XL high res') : [
            'JB_18072012_4-1_A_FT.mzML',
            'JB_18072012_4-1_A_pH3.mzML',
            'JB_18072012_4-1_A_pH4.mzML',
            'JB_18072012_4-1_A_pH5.mzML',
            'JB_18072012_4-1_A_pH6.mzML',
            'JB_18072012_4-1_A_pH8.mzML',
        ],

        ('rep3_sample_2_3','LTQ XL low res'): [
            'JB_FASP_pH3_2-3_28122012.mzML',
            'JB_FASP_pH4_2-3_28122012.mzML',
            'JB_FASP_pH5_2-3_28122012.mzML',
            'JB_FASP_pH6_2-3_28122012.mzML',
            'JB_FASP_pH8_2-3_28122012.mzML',
            'JB_FASP_pH11-FT_2-3_28122012.mzML',
        ],
        ('rep3_sample_2_4','LTQ XL low res'): [
            'JB_FASP_pH3_2-4_28122012.mzML',
            'JB_FASP_pH4_2-4_28122012.mzML',
            'JB_FASP_pH5_2-4_28122012.mzML',
            'JB_FASP_pH6_2-4_28122012.mzML',
            'JB_FASP_pH8_2-4_28122012.mzML',
            'JB_FASP_pH11-FT_2-4_28122012.mzML',
        ],
        ('rep3_sample_3_1','LTQ XL low res'): [
            'JB_FASP_pH3_3-1_28122012.mzML',
            'JB_FASP_pH4_3-1_28122012.mzML',
            'JB_FASP_pH5_3-1_28122012.mzML',
            'JB_FASP_pH6_3-1_28122012.mzML',
            'JB_FASP_pH8_3-1_28122012.mzML',
            'JB_FASP_pH11-FT_3-1_28122012.mzML',
        ],
        ('rep3_sample_4_1','LTQ XL low res'): [
            'JB_FASP_pH3_4-1_28122012.mzML',
            'JB_FASP_pH4_4-1_28122012.mzML',
            'JB_FASP_pH5_4-1_28122012.mzML',
            'JB_FASP_pH6_4-1_28122012.mzML',
            'JB_FASP_pH8_4-1_28122012.mzML',
            'JB_FASP_pH11-FT_4-1_28122012_130121201449.mzML',
        ],
    }

    # Download the mzML files of every replicate/condition from peptideatlas:
    for (outfolder,profile), mzML_file_list in sorted(output_folder_to_file_list.items()):
        uc.params['ftp_output_folder'] = os.path.join(
            input_params['ftp_output_folder_root'],
            outfolder
        )
        uc.params['ftp_include_ext'] = mzML_file_list

        if os.path.exists(uc.params['ftp_output_folder']) is False:
            os.makedirs( uc.params['ftp_output_folder'] )

        uc.fetch_file(
            engine     = 'get_ftp_files_1_0_0'
        )

    # NOTE: a second, redundant database fetch was removed here; the database
    # is already fetched above, before the FTP downloads.

    search_engines  = [
        'omssa_2_1_9',
        'xtandem_piledriver',
        'myrimatch_2_1_138',
        'msgfplus_v9979',
        'msamanda_1_0_0_5243',
    ]

    # This dict will be populated with the percolator-validated results
    # of each engine ( 3 replicates x 4 conditions = 12 files each )
    percolator_results = {
        'omssa_2_1_9'        : [],
        'xtandem_piledriver' : [],
        'msgfplus_v9979'     : [],
        'myrimatch_2_1_138'  : [],
        'msamanda_1_0_0_5243': [],
    }

    for search_engine in search_engines:

        for mzML_dir_ext, mass_spectrometer in output_folder_to_file_list.keys():
            # Each replicate was measured on a different instrument, so the
            # profile is set per folder:
            uc.set_profile( mass_spectrometer )

            mzML_dir = os.path.join(
                input_params['ftp_output_folder_root'],
                mzML_dir_ext
            )
            # all files ending with .mzml in that directory will be used!

            unified_results_list = []
            for filename in glob.glob( os.path.join( mzML_dir,'*.mzML') ):
                if filename.lower().endswith(".mzml"):
                    unified_search_results = uc.search(
                        input_file = filename,
                        engine     = search_engine,
                    )
                    unified_results_list.append(
                        unified_search_results
                    )

            # Merging results from the 6 pH-fractions:
            merged_unified = uc.merge_csvs( unified_results_list )

            # Validation with Percolator:
            percolator_validated = uc.validate(
                input_file = merged_unified,
                engine     = 'percolator_2_08',  # one could replace this with 'qvality'
            )
            percolator_results[ search_engine ].append(
                percolator_validated
            )

            # At this point, the analysis is finished. We got
            # Percolator-validated results for each of the
            # 3 replicates x 4 conditions.

            # But let's see how well the five search engines
            # performed! To compare, we collect all PSMs with
            # an estimated FDR <= 0.01 for each engine, and
            # plot this information with the VennDiagram UNode.
            # We will also use the Combined FDR Score method
            # to combine the results from all five engines,
            # and increase the number of identified peptides.


    five_large_merged = []
    filtered_final_results = []

    # We will estimate the FDR for all 60 files
    # (5 engines x12 files) when using percolator PEPs as
    # quality score
    uc.params['validation_score_field'] = 'PEP'
    uc.params['bigger_scores_better']   = False

    # To obtain smaller CSV files (and make plotting less
    # RAM-intensive), we remove all decoys and PSMs above
    # 0.06 FDR
    uc.params['csv_filter_rules'] = [
        ['estimated_FDR', 'lte', 0.06],
        ['Is decoy', 'equals', 'false']
    ]
    for engine, percolator_validated_list in percolator_results.items():

        # unfiltered files for cFDR script
        twelve_merged = uc.merge_csvs( percolator_validated_list )

        twelve_filtered = []
        for one_of_12 in percolator_validated_list:
            one_of_12_FDR = uc.add_estimated_fdr(
                input_file = one_of_12,
            )
            one_of_12_FDR_filtered = uc.filter_csv(
                input_file = one_of_12_FDR,
            )
            twelve_filtered.append( one_of_12_FDR_filtered )

        # For the combined FDR scoring, we merge all 12 files:
        filtered_merged = uc.merge_csvs( twelve_filtered )

        five_large_merged.append( twelve_merged )
        filtered_final_results.append( filtered_merged )

    # The five big merged files of each engine are combined:
    cFDR = uc.combine_search_results(
        input_files = five_large_merged,
        engine      = 'combine_FDR_0_1',
    )

    # We estimate the FDR of this combined approach:
    uc.params['validation_score_field'] = 'Combined FDR Score'
    uc.params['bigger_scores_better']   = False

    cFDR_FDR = uc.add_estimated_fdr(
        input_file = cFDR,
    )

    # Removing decoys and low quality hits, to obtain a
    # smaller file:
    uc.params['csv_filter_rules'] = [
        ['estimated_FDR', 'lte', 0.06],
        ['Is decoy', 'equals', 'false']
    ]
    cFDR_filtered_results = uc.filter_csv(
        input_file = cFDR_FDR,
    )
    filtered_final_results.append( cFDR_filtered_results )

    # Since we produced quite a lot of files, let's print the full
    # paths to our most important result files so we find them quickly:
    print('''
    These files can now be easily parsed and plotted with your
    plotting tool of choice! We used the Python plotting library
    matplotlib. Each unique combination of Sequence, modification
    and charge was counted as a unique peptide.
    ''')
    print("\n########### Result files: ##############")
    for x in filtered_final_results:
        print( x )

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(main.__doc__)
        exit()
    main(sys.argv[1])

Complete workflow for human BR dataset analysis

human_br_complete_workflow.main(folder)

usage:

./human_br_complete_workflow.py <folder_with_human_br_files>

This script produces the data for figure 3.

#!/usr/bin/env python3.4
# encoding: utf-8
import ursgal
import os
import sys
import pprint

def main(folder):
    '''

    usage:

        ./human_br_complete_workflow.py <folder_with_human_br_files>

    This script produces the data for figure 3.

    '''

    # Initialize the UController:
    uc = ursgal.UController(
        params = {
            'enzyme'                : 'trypsin',
            'decoy_generation_mode' : 'reverse_protein',
        }
    )

    # MS Spectra, downloaded from http://proteomecentral.proteomexchange.org
    # via the dataset accession PXD000263 and converted to mzML


    mass_spec_files = [
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-01_013.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-02_025.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-03_033.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-04_048.mzML',
    ]

    # The raw data cannot be fetched automatically; abort with instructions
    # if any of the expected mzML files is missing:
    for mass_spec_file in mass_spec_files:
        if os.path.exists( os.path.join( folder, mass_spec_file ) ) is False:
            print('Please download RAW files to folder {} and convert to mzML:'.format(folder))
            pprint.pprint(mass_spec_files)
            exit()

    # mods from Wen et al. (2015):
    modifications = [
        'C,fix,any,Carbamidomethyl',  # Carbamidomethyl  (C) was set as fixed modification
        'M,opt,any,Oxidation',        # Oxidation (M) as well as
        'N,opt,any,Deamidated',       # Deamidated (NQ) were set as optional modification
        'Q,opt,any,Deamidated',       # Deamidated (NQ) were set as optional modification
    ]


    # The target peptide database which will be searched (UniProt Human reference proteome from July 2013)
    target_database = 'uniprot_human_UP000005640_created_until_20130707.fasta'
    # Let's turn it into a target decoy database by reversing peptides:
    target_decoy_database = uc.generate_target_decoy(
        input_file = target_database,
    )

    # OMSSA parameters from Wen et al. (2015):
    omssa_params = {
        # (used by default)                             # -w
        'he'                            : '1000',       # -he 1000
        'zcc'                           : '1',          # -zcc 1
        'frag_mass_tolerance'           : '0.6',        # -to 0.6
        'frag_mass_tolerance_unit'      : 'da',         # -to 0.6
        'precursor_mass_tolerance_minus': '10',         # -te 10
        'precursor_mass_tolerance_plus' : '10',         # -te 10
        'precursor_mass_tolerance_unit' : 'ppm',        # -teppm
        'score_a_ions'                  : False,        # -i 1,4
        'score_b_ions'                  : True,         # -i 1,4
        'score_c_ions'                  : False,        # -i 1,4
        'score_x_ions'                  : False,        # -i 1,4
        'score_y_ions'                  : True,         # -i 1,4
        'score_z_ions'                  : False,        # -i 1,4
        'enzyme'                        : 'trypsin_p',  # -e 10
        'maximum_missed_cleavages'      : '1',          # -v 1
        'precursor_max_charge'          : '8',          # -zh 8
        'precursor_min_charge'          : '1',          # -zl 1
        'tez'                           : '1',          # -tez 1
        'precursor_isotope_range'       : '0,1',        # -ti 1
        'num_match_spec'                : '1',          # -hc 1

        'database'                      : target_decoy_database,
        'modifications'                 : modifications,
    }


    # MS-GF+ parameters from Wen et al. (2015):
    msgf_params = {
        'precursor_mass_tolerance_unit' : 'ppm',      # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_minus': '10',       # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_plus' : '10',       # precursor ion mass tolerance was set to 10 ppm
                                                    # the max number of optional modifications per peptide were set as 3
        # (used by default)                           # number of allowed isotope errors was set as 1
        'enzyme'                        : 'trypsin',  # the enzyme was set as trypsin
        # (used by default)                           # fully enzymatic peptides were specified, i.e. no non-enzymatic termini
        'frag_method'                   : '1',        # the fragmentation method selected in the search was CID
        'max_pep_length'                : '45',       # the maximum peptide length to consider was set as 45
        'precursor_min_charge'          : '1',        # the minimum precursor charge to consider if charges are not specified in the spectrum file was set as 1
        'precursor_max_charge'          : '8',        # the maximum precursor charge to consider was set as 8
        # (used by default)                           # the parameter 'addFeatures' was set as 1 (required for Percolator)
                                                    # all of the other parameters were set as default
                                                    # the instrument selected was High-res

        'database'                      : target_decoy_database,
        'modifications'                 : modifications,
    }


    # X!Tandem parameters from Wen et al. (2015):
    xtandem_params = {
        'precursor_mass_tolerance_unit' : 'ppm',      # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_minus': '10',       # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_plus' : '10',       # precursor ion mass tolerance was set to 10 ppm
        'frag_mass_tolerance'           : '0.6',      # the fragment ion mass tolerance was set to 0.6 Da
        'frag_mass_tolerance_unit'      : 'da',       # the fragment ion mass tolerance was set to 0.6 Da
        'precursor_isotope_range'       : '0,1',      # parent monoisotopic mass isotope error was set as 'yes'
        'precursor_max_charge'          : '8',        # maximum parent charge of spectrum was set as 8
        'enzyme'                        : 'trypsin',  # the enzyme was set as trypsin ([RK]|[X])
        'maximum_missed_cleavages'      : '1',        # the maximum missed cleavage sites were set as 1
        # (used by default)                           # no model refinement was employed.

        'database'                      : target_decoy_database,
        'modifications'                 : modifications,
    }


    # (engine, parameter set, instrument profile or None to keep the default):
    search_engine_settings = [
        ('msamanda_1_0_0_5243', xtandem_params, 'LTQ XL high res'),  # not used in Wen et al., so we use the same settings as xtandem
        ('myrimatch_2_1_138', xtandem_params, 'LTQ XL high res'),    # not used in Wen et al., so we use the same settings as xtandem
        ('msgfplus_v9979', msgf_params, 'LTQ XL high res'),          # the instrument selected was High-res
        ('xtandem_jackhammer', xtandem_params, None),
        ('omssa_2_1_9', omssa_params, None),
    ]

    merged_validated_files_3_engines = []
    merged_validated_files_5_engines = []

    for engine, wen_params, instrument in search_engine_settings:

        # Initializing the uPLANIT UController class with
        # our specified modifications and mass spectrometer
        # (a fresh controller per engine, since each engine uses its own
        # parameter set):
        uc = ursgal.UController(
            params = wen_params
        )

        if instrument is not None:
            uc.set_profile( instrument )

        unified_results              = []
        percolator_validated_results = []

        # Search every mzML file with the current engine and validate each
        # result with Percolator:
        for mzML_file in mass_spec_files:
            unified_search_results = uc.search(
                input_file = mzML_file,
                engine     = engine,
            )
            unified_results.append(
                unified_search_results
            )
            validated_csv = uc.validate(
                input_file   = unified_search_results,
                engine       = 'percolator_2_08',
            )
            percolator_validated_results.append( validated_csv )

        merged_validated_csv = uc.merge_csvs(
                input_files = percolator_validated_results,
        )
        merged_unvalidated_csv = uc.merge_csvs(
                input_files = unified_results,
        )

        # The 3-engine summary uses only the engines that Wen et al. used:
        if engine in ["omssa_2_1_9", "xtandem_jackhammer", "msgfplus_v9979"]:
            merged_validated_files_3_engines.append( merged_validated_csv )
        merged_validated_files_5_engines.append( merged_validated_csv )


    # Combined FDR Score over all five engines:
    uc.params['prefix'] = '5-engines-summary'
    uc.combine_search_results(
        input_files = merged_validated_files_5_engines,
        engine      = 'combine_FDR_0_1',
    )

    # ... and over the three engines used by Wen et al.:
    uc.params['prefix'] = '3-engines-summary'
    uc.combine_search_results(
        input_files = merged_validated_files_3_engines,
        engine      = 'combine_FDR_0_1',
    )


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(main.__doc__)
        exit()
    main(sys.argv[1])

mgf conversion loop examples

mgf_conversion_inside_and_outside_loop.main()
#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os
import shutil


def main():
    '''
    Downloads an example BSA mzML file (if not already present) and shows two
    ways of running an OMSSA search on it:

    1. Convert the mzML file to MGF once, outside the parameter loop, and
       re-use the MGF file for every search (saves time, the conversion is
       not re-run).
    2. Pass the mzML file directly to search() inside the loop, so the MGF
       conversion happens implicitly (it is re-run whenever the prefix
       changes).

    usage:
        ./mgf_conversion_inside_and_outside_loop.py
    '''
    R = ursgal.UController(
        profile = 'LTQ XL low res',
        params = {
            'database': os.path.join( os.pardir, 'example_data', 'BSA.fasta' ),
            'modifications' : [
                'M,opt,any,Oxidation',        # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'    # N-terminal acetylation
            ]
        },
    )

    engine = 'omssa'
    output_files = []

    mzML_file = os.path.join(
        os.pardir,
        'example_data',
        'mgf_conversion_example',
        'BSA1.mzML'
    )
    if not os.path.exists(mzML_file):
        # Fetch the example file from the OpenMS repository. Depending on the
        # fetcher, the downloaded name may or may not keep the '?format=raw'
        # query-string suffix, so both candidates are tried below.
        R.params['http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(
            engine = 'get_http_files_1_0_0'
        )
        try:
            shutil.move(
                '{0}?format=raw'.format(mzML_file),
                mzML_file
            )
        # Was a bare 'except:' — narrowed so that KeyboardInterrupt and
        # other unrelated errors are not silently swallowed; shutil.move
        # raises OSError subclasses when the source name does not exist.
        except OSError:
            shutil.move(
                '{0}format=raw'.format(mzML_file),
                mzML_file
            )

    # First method: Convert to MGF outside of the loop:
    # (saves some time cause the MGF conversion is not always re-run)
    mgf_file = R.convert_to_mgf_and_update_rt_lookup(
        input_file       = mzML_file, # from OpenMS example files
    )
    for prefix in ['10ppm', '20ppm']:
        R.params['prefix'] = prefix

        output_file = R.search(
            input_file = mgf_file,
            engine     = engine,
            output_file_name = 'some_userdefined_name'
        )
        output_files.append( output_file )


    # Second method: Automatically convert to MGF inside the loop:
    # (MGF conversion is re-run every time because the prefix changed!)
    for prefix in ['5ppm', '15ppm']:
        R.params['prefix'] = prefix

        output_file = R.search(
            input_file = mzML_file, # from OpenMS example files
            engine     = engine,
            output_file_name = 'another_fname',
        )
        output_files.append( output_file )


    print('\tOutput files:')
    for f in output_files:
        print(f)

# Print the module docstring as usage information, then run the example.
if __name__ == '__main__':
    print(__doc__)
    main()

Example search with search for labeling with 15N and no label

search_with_label_15N.main()

Executes a search with 3 different search engines on an example file from the data from Barth et al. Two searches are performed per engine, one for each of 14N (unlabeled) and 15N labeling. (The same file that is used in the XTandem version comparison example.)

usage:
./search_with_label_15N.py
#!/usr/bin/env python3.4
# encoding: utf-8

import ursgal
import os


def main():
    '''
    Executes a search with 3 different search engines on an example file from
    the data from Barth et al. Two searches are performed per engine, one for
    each of 14N (unlabeled) and 15N labeling.

    usage:
        ./search_with_label_15N.py
    '''

    engine_list = [
        'omssa_2_1_9',
        'xtandem_piledriver',
        'msgfplus_v9979',
    ]

    params = {
        'database' : os.path.join(
            os.pardir,
            'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'modifications' : [ ],
        # Keep only confident target hits: PEP <= 1 % and not a decoy.
        'csv_filter_rules' : [
            ['PEP'      , 'lte'    , 0.01],
            ['Is decoy' , 'equals' , 'false']
        ],
        # Example raw data is hosted on the PeptideAtlas FTP server.
        'ftp_url'       : 'ftp.peptideatlas.org',
        'ftp_login'         : 'PASS00269',
        'ftp_password'      : 'FI4645a',
        'ftp_include_ext'   : [
            'JB_FASP_pH8_2-3_28122012.mzML',
        ],
        'ftp_output_folder' : os.path.join(
            os.pardir,
            'example_data',
            'search_with_label_15N'
        ),
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta' ,
        'http_output_folder' : os.path.join(
            os.pardir,
            'example_data'
        )
    }

    # makedirs(..., exist_ok=True) replaces the check-then-mkdir pair: it is
    # race-free and also creates missing parent directories.
    os.makedirs(params['ftp_output_folder'], exist_ok=True)

    uc = ursgal.UController(
        profile = 'LTQ XL low res' ,
        params = params
    )
    mzML_file = os.path.join(
        params['ftp_output_folder'],
        params['ftp_include_ext'][0]
    )
    # Download the spectrum file and the database only when missing.
    if not os.path.exists(mzML_file):
        uc.fetch_file(
            engine     = 'get_ftp_files_1_0_0'
        )
    if not os.path.exists(params['database']):
        uc.fetch_file(
            engine     = 'get_http_files_1_0_0'
        )

    mgf_file = uc.convert_to_mgf_and_update_rt_lookup(
        input_file       = mzML_file,
    )

    # One search/validate/filter pass per label state; the label doubles as a
    # file-name prefix so 14N and 15N output files do not collide.
    for label in [ '14N', '15N' ]:
        validated_files_list = []
        uc.params['label'] = label
        uc.params['prefix'] = label
        for engine in engine_list:
            unified_result_file = uc.search(
                input_file = mgf_file,
                engine     = engine,
            )

            validated_file = uc.validate(
                input_file = unified_result_file,
                engine     = 'percolator_2_08',
            )

            filtered_file = uc.filter_csv(
                input_file = validated_file,
            )

            validated_files_list.append( filtered_file )
        # Venn diagram of the engines' filtered identifications per label.
        uc.visualize(
            input_files     = validated_files_list,
            engine          = 'venndiagram',
        )

    return

# Run the example when executed as a script.
if __name__ == '__main__':
    main()