Locked History Actions

Events/GCC2012/TrainingDay/WS1

Slides are available here: Slides.

maf_slice.py:

"""
Modified from interval2maf.py for teaching workshops

Reads a list of intervals and a maf. Produces a new maf containing the
blocks or parts of blocks in the original that overlapped the intervals.

If a MAF file, not UID, is provided the MAF file is indexed before being processed.

NOTE: If two intervals overlap the same block it will be written twice.

usage: %prog maf_file [options]
   -d, --dbkey=d: Database key, ie hg17
   -c, --chromCol=c: Column of Chr
   -s, --startCol=s: Column of Start
   -e, --endCol=e: Column of End
   -S, --strandCol=S: Column of Strand
   -t, --mafType=t: Type of MAF source to use
   -m, --mafFile=m: Path of source MAF file, if not using cached version
   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
   -i, --interval_file=i:       Input interval file
   -o, --output_file=o:      Output MAF file
   -p, --species=p: Species to include in output
   -P, --split_blocks_by_species=P: Split blocks by species
   -r, --remove_all_gap_columns=r: Remove all Gap columns
   -l, --indexLocation=l: Override default maf_index.loc file
   -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc )
"""

#Dan Blankenberg
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
from bx.cookbook import doc_optparse
import bx.align.maf
import bx.intervals.io
from galaxy.tools.util import maf_utilities
import sys

assert sys.version_info[:2] >= ( 2, 4 )

def __main__():
    """
    Command-line entry point: write the MAF blocks (or parts of blocks) that
    overlap each region of an interval file to a new MAF file.

    Options are parsed from the module docstring by doc_optparse. The source
    alignments come either from a MAF source looked up by UID ( --mafType )
    or from a MAF file ( --mafFile ) for which an index is opened or built.
    For every region, each block returned by the index is optionally split by
    species, chopped to the region, optionally limited to the requested
    species, oriented to the region and written out. A one-line summary is
    printed at the end.

    The interval file, the output writer and any index file name returned by
    open_or_build_maf_index are cleaned up even when processing raises.
    Invalid command lines are reported through maf_utilities.tool_fail
    ( presumably terminating the run; its definition is not visible here ).
    """
    index = index_filename = None
    
    #Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    
    #An empty or missing dbkey is treated the same as the unknown build "?"
    dbkey = options.dbkey or None
    if dbkey in [None, "?"]:
        maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
    
    species = maf_utilities.parse_species_option( options.species )
    
    #Column options are 1-based on the command line and 0-based internally
    if options.chromCol:
        chromCol = int( options.chromCol ) - 1
    else:
        chromCol = 0
    
    if options.startCol:
        startCol = int( options.startCol ) - 1
    else:
        startCol = 1
    
    if options.endCol:
        endCol = int( options.endCol ) - 1
    else:
        endCol = 2
    
    if options.strandCol:
        strandCol = int( options.strandCol ) - 1
    else:
        strandCol = -1
    
    if options.interval_file:
        interval_file = options.interval_file
    else:
        maf_utilities.tool_fail( "Input interval file has not been specified." )
    
    if options.output_file:
        output_file = options.output_file
    else:
        maf_utilities.tool_fail( "Output file has not been specified." )
    
    #All-gap columns are always removed unless blocks are split by species,
    #in which case removal has to be requested explicitly
    split_blocks_by_species = remove_all_gap_columns = False
    if options.split_blocks_by_species == 'split_blocks_by_species':
        split_blocks_by_species = True
        if options.remove_all_gap_columns == 'remove_all_gap_columns':
            remove_all_gap_columns = True
    else:
        remove_all_gap_columns = True
    #Finish parsing command line
    
    #Open indexed access to MAFs
    if options.mafType:
        #--indexLocation, when given, takes precedence over --mafIndexFile
        if options.indexLocation:
            index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation )
        else:
            index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile )
        if index is None:
            maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) )
    elif options.mafFile:
        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] )
        if index is None:
            maf_utilities.tool_fail( "Your MAF file appears to be malformed." )
    else:
        maf_utilities.tool_fail( "Desired source MAF type has not been specified." )
    
    num_blocks = 0
    num_regions = None
    out = None
    interval_handle = None
    #try/finally (rather than "with") keeps the script valid on the
    #Python 2.4 minimum asserted at module level
    try:
        #Create MAF writer
        out = bx.align.maf.Writer( open( output_file, "w" ) )
        
        #Iterate over input regions
        interval_handle = open( interval_file, 'r' )
        regions = bx.intervals.io.NiceReaderWrapper( interval_handle,
                                                     chrom_col = chromCol,
                                                     start_col = startCol,
                                                     end_col = endCol,
                                                     strand_col = strandCol,
                                                     fix_strand = True,
                                                     return_header = False,
                                                     return_comments = False )
        for num_regions, region in enumerate( regions ):
            src = maf_utilities.src_merge( dbkey, region.chrom )
            for block in index.get_as_iterator( src, region.start, region.end ):
                if split_blocks_by_species:
                    #Keep only the split blocks whose dbkey component still overlaps the region
                    candidates = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ]
                else:
                    candidates = [ block ]
                for candidate in candidates:
                    sliced = maf_utilities.chop_block_by_region( candidate, src, region )
                    if sliced is None:
                        continue
                    if species is not None:
                        sliced = sliced.limit_to_species( species )
                    sliced = maf_utilities.orient_block_by_region( sliced, src, region )
                    if remove_all_gap_columns:
                        sliced.remove_all_gap_columns()
                    out.write( sliced )
                    num_blocks += 1
    finally:
        try:
            if interval_handle is not None:
                interval_handle.close()
            #Close output MAF
            if out is not None:
                out.close()
        finally:
            #remove index file if created during run
            maf_utilities.remove_temp_index_file( index_filename )
    
    #A single parenthesised argument prints identically under Python 2
    #and is also valid Python 3 syntax
    if num_blocks:
        print( "%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) ) )
    elif num_regions is not None:
        print( "No MAF blocks could be extracted for %i regions." % ( num_regions + 1 ) )
    else:
        print( "No valid regions have been provided." )
    
if __name__ == "__main__": __main__()

tool_conf.xml section:

<toolbox>
  <!-- Registers the workshop tool: a "Workshop Demo" section containing the
       tool defined in demo/maf_slice.xml. -->
  <section name="Workshop Demo" id="workshop_demo">
    <tool file="demo/maf_slice.xml" />
  </section>
</toolbox>

End of Exercise, adding a basic tool:

<tool id="maf_slice" name="Slice MAF" version="1.0.0">
  <!-- Basic wrapper: runs maf_slice.py on one MAF dataset and one BED
       dataset and produces one MAF dataset. The build (dbkey) is
       hard-coded to hg17 in the command line below. -->
  <description>by intervals</description>
  <command interpreter="python">
maf_slice.py --dbkey=hg17 --mafFile=${maf_input} --interval_file=${interval_input}  --output_file=${maf_output}
   </command>
   <!-- Both inputs are datasets (type="data"), restricted by format. -->
   <inputs>
    <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
    <param format="bed" name="interval_input" type="data" label="Choose intervals"/>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>

End of Exercise, adding metadata elements to tool:

<tool id="maf_slice" name="Slice MAF" version="1.0.1">
  <!-- Metadata-driven wrapper: the build (dbkey) and the chrom, start, end
       and strand column numbers are read from the interval dataset, and the
       MAF index path is read from the MAF dataset's maf_index metadata,
       instead of being hard-coded. -->
  <description>by intervals</description>
  <command interpreter="python">
maf_slice.py --dbkey=${interval_input.dbkey} --mafFile=${maf_input} --interval_file=${interval_input}  --output_file=${maf_output}
     --chromCol=${interval_input.metadata.chromCol} --startCol=${interval_input.metadata.startCol} --endCol=${interval_input.metadata.endCol} --strandCol=${interval_input.metadata.strandCol}
     --mafIndex=${maf_input.metadata.maf_index}
   </command>
   <!-- The interval input accepts format="interval", whose column metadata
        is referenced by the command line above. -->
   <inputs>
    <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
    <param format="interval" name="interval_input" type="data" label="Choose intervals"/>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>

End of Exercise, adding conditional:

<tool id="maf_slice" name="Slice MAF" version="1.0.2">
  <!-- Introduces a conditional ("maf_source_type") that lets the user pick
       between locally cached alignments and a MAF dataset from the history.
       Parameters nested in the conditional are addressed in the command
       line through the conditional's name, e.g. maf_source_type.maf_input.

       NOTE(review): the command line references maf_source_type.maf_input
       unconditionally, but that parameter is only declared in the "user"
       branch, and the "cached" option is the one selected by default. The
       "cached" branch is still a placeholder at this stage, so presumably
       only the "user" choice can produce a working command line; confirm
       before using this intermediate version as a real tool. -->
  <description>by intervals</description>
  <command interpreter="python">
maf_slice.py --dbkey=${interval_input.dbkey} --mafFile=${maf_source_type.maf_input} --interval_file=${interval_input}  --output_file=${maf_output}
     --chromCol=${interval_input.metadata.chromCol} --startCol=${interval_input.metadata.startCol} --endCol=${interval_input.metadata.endCol} --strandCol=${interval_input.metadata.strandCol}
     --mafIndex=${maf_source_type.maf_input.metadata.maf_index}
   </command>
   <inputs>
    <param format="interval" name="interval_input" type="data" label="Choose intervals"/>
    <!-- The select named maf_source chooses which "when" branch applies. -->
    <conditional name="maf_source_type">
      <param name="maf_source" type="select" label="MAF Source">
        <option value="cached" selected="true">Locally Cached Alignments</option>
        <option value="user">Alignments in Your History</option>
      </param>
      <when value="user">
        <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
      </when>
      <when value="cached">
        <!-- need some way to access the external data -->
      </when>
    </conditional>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>

End of Exercise, accessing .loc file:

<tool id="maf_slice" name="Slice MAF" version="1.0.3">
  <!-- Both MAF sources are now usable. The command line branches on the
       value of maf_source_type.maf_source: for "user" it passes the history
       MAF dataset and its maf_index metadata; otherwise it passes the
       selected cached alignment (mafType) together with the maf_index.loc
       file located under GALAXY_DATA_INDEX_DIR. -->
  <description>by intervals</description>
  <command interpreter="python">
     maf_slice.py --dbkey=${interval_input.dbkey} --interval_file=${interval_input}  --output_file=${maf_output}
     --chromCol=${interval_input.metadata.chromCol} --startCol=${interval_input.metadata.startCol} --endCol=${interval_input.metadata.endCol} --strandCol=${interval_input.metadata.strandCol}
     #if $maf_source_type.maf_source == "user":
         --mafFile=${maf_source_type.maf_input}
         --mafIndex=${maf_source_type.maf_input.metadata.maf_index}
     #else:
         --mafType=$maf_source_type.mafType
         --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc
     #end if
   </command>
   <inputs>
    <param format="interval" name="interval_input" type="data" label="Choose intervals"/>
    <conditional name="maf_source_type">
      <param name="maf_source" type="select" label="MAF Source">
        <option value="cached" selected="true">Locally Cached Alignments</option>
        <option value="user">Alignments in Your History</option>
      </param>
      <when value="user">
        <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
      </when>
      <when value="cached">
        <!-- Options come from the "indexed_maf_files" data table, filtered
             on its dbkey column against the dbkey of the selected interval
             dataset; the validator reports when no option remains. -->
        <param name="mafType" type="select" label="Choose alignments">
          <options from_data_table="indexed_maf_files">
            <filter type="data_meta" ref="interval_input" key="dbkey" column="dbkey" multiple="True" separator=","/>
            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
          </options>
        </param>
      </when>
    </conditional>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>

End of Exercise, selecting species from metadata values for MAF in history:

<tool id="maf_slice" name="Slice MAF" version="1.0.4">
  <!-- Adds species selection for MAF datasets taken from the history. The
       species option is only passed on the command line in the "user"
       branch; the "cached" branch has no species parameter in this
       version. -->
  <description>by intervals</description>
  <command interpreter="python">
     maf_slice.py --dbkey=${interval_input.dbkey} --interval_file=${interval_input}  --output_file=${maf_output}
     --chromCol=${interval_input.metadata.chromCol} --startCol=${interval_input.metadata.startCol} --endCol=${interval_input.metadata.endCol} --strandCol=${interval_input.metadata.strandCol}
     #if $maf_source_type.maf_source == "user":
         --mafFile=${maf_source_type.maf_input}
         --mafIndex=${maf_source_type.maf_input.metadata.maf_index}
         --species=${maf_source_type.species}
     #else:
         --mafType=$maf_source_type.mafType
         --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc
     #end if
   </command>
   <inputs>
    <param format="interval" name="interval_input" type="data" label="Choose intervals"/>
    <conditional name="maf_source_type">
      <param name="maf_source" type="select" label="MAF Source">
        <option value="cached" selected="true">Locally Cached Alignments</option>
        <option value="user">Alignments in Your History</option>
      </param>
      <when value="user">
        <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
        <!-- Multi-select shown as checkboxes; its options are taken from
             the "species" metadata of the selected maf_input dataset. -->
        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
          <options>
            <filter type="data_meta" ref="maf_input" key="species" />
          </options>
        </param>
      </when>
      <when value="cached">
        <!-- Options come from the "indexed_maf_files" data table, filtered
             on its dbkey column against the dbkey of the selected interval
             dataset; the validator reports when no option remains. -->
        <param name="mafType" type="select" label="Choose alignments">
          <options from_data_table="indexed_maf_files">
            <filter type="data_meta" ref="interval_input" key="dbkey" column="dbkey" multiple="True" separator=","/>
            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
          </options>
        </param>
      </when>
    </conditional>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>

End of Exercise, selecting species available from .loc file:

<tool id="maf_slice" name="Slice MAF" version="1.0.5">
  <!-- Species selection is available for both MAF sources. Each "when"
       branch declares its own parameter named "species", so the species
       option is passed once, outside the #if block, as
       maf_source_type.species. -->
  <description>by intervals</description>
  <command interpreter="python">
     maf_slice.py --dbkey=${interval_input.dbkey} --interval_file=${interval_input}  --output_file=${maf_output}
     --chromCol=${interval_input.metadata.chromCol} --startCol=${interval_input.metadata.startCol} --endCol=${interval_input.metadata.endCol} --strandCol=${interval_input.metadata.strandCol}
     --species=${maf_source_type.species}
     #if $maf_source_type.maf_source == "user":
         --mafFile=${maf_source_type.maf_input}
         --mafIndex=${maf_source_type.maf_input.metadata.maf_index}
     #else:
         --mafType=$maf_source_type.mafType
         --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc
     #end if
   </command>
   <inputs>
    <param format="interval" name="interval_input" type="data" label="Choose intervals"/>
    <conditional name="maf_source_type">
      <param name="maf_source" type="select" label="MAF Source">
        <option value="cached" selected="true">Locally Cached Alignments</option>
        <option value="user">Alignments in Your History</option>
      </param>
      <when value="user">
        <param format="maf" name="maf_input" label="Choose alignments" type="data"/>
        <!-- Multi-select shown as checkboxes; its options are taken from
             the "species" metadata of the selected maf_input dataset. -->
        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
          <options>
            <filter type="data_meta" ref="maf_input" key="species" />
          </options>
        </param>
      </when>
      <when value="cached">
        <!-- Options come from the "indexed_maf_files" data table, filtered
             on its dbkey column against the dbkey of the selected interval
             dataset; the validator reports when no option remains. -->
        <param name="mafType" type="select" label="Choose alignments">
          <options from_data_table="indexed_maf_files">
            <filter type="data_meta" ref="interval_input" key="dbkey" column="dbkey" multiple="True" separator=","/>
            <validator type="no_options" message="No alignments are available for the build associated with the selected interval file"/>
          </options>
        </param>
        <!-- Species for the cached source are read from the same data
             table: column 1 is exposed as "uid" and column 3 supplies both
             the option value and its display name. Rows are restricted to
             the one whose uid matches the selected mafType, and the
             comma-separated name column is then split into one option per
             entry. -->
        <param name="species" type="select" display="checkboxes" multiple="true" label="Choose species" help="Select species to be included in the final alignment">
          <options from_data_table="indexed_maf_files">
            <column name="uid" index="1"/>
            <column name="value" index="3"/>
            <column name="name" index="3"/>
            <filter type="param_value" ref="mafType" column="uid"/>
            <filter type="multiple_splitter" column="name" separator=","/>
          </options>
        </param>
      </when>
    </conditional>
   </inputs>
   <outputs>
     <data format="maf" name="maf_output"/>
   </outputs>
   <!-- No tests or help text are defined at this stage. -->
   <tests>
   </tests>
   <help>
  </help>
</tool>