Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modifies SequenceProfile.py allow functionality for multiple chains w… #96

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 42 additions & 26 deletions protein_tools/scripts/SequenceProfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
# (c) addressed to University of Washington CoMotion, email: [email protected].

## Authors: Florian Richter
## Modified by Nick Randolph (Jan 2021)

################################################################################################
# Sequence profile for a list of structures vs a template
# To get the % of each residue at each site:
# SequenceProfile.py
# -l is the input list
# -t is the template to align to
# python ~/install/perl/tools/SequenceProfile.py -l list.txt -t original_structure.pdb
# -c is the ID of chain of interest if multiple chains present
# NOTE: IF LOOKING FOR PROFILE OF MULTIPLE CHAINS, RESULTS MIGHT NOT BE ACCURATE! RUN SCRIPT
# MULTIPLE TIMES USING -c FOR EACH INDIVIDUAL CHAIN!
# python ~/install/perl/tools/SequenceProfile.py -l list.txt -t original_structure.pdb -c chainID
################################################################################################

import sys
Expand Down Expand Up @@ -71,7 +75,7 @@ def AA3LetTo1Let(aa):



def get_coordinates(struct):
def get_coordinates(struct,ChainID):

struct = struct.replace("\n","")
struct_file = open(struct,'r')
Expand All @@ -92,17 +96,18 @@ def get_coordinates(struct):

if atom_reading_flag:
cols = line.split()
if cur_res != int(line[23:26]):
if cur_res != -50000:
all_coordinates[cur_res] = cur_coordinates
cur_res = int(line[23:26])
cur_coordinates = {}
cur_coordinates['type'] = cols[3]
cur_coordinates['chain'] = line[21:22]
cur_coordinates['active'] = 1 #keep track of whether we count this residue
cur_coordinates[cols[2]] = (float(line[31:38]), float(line[39:46]), float(line[47:54]))
# elif (not ca_only) and line[13:14] != 'H':
# cur_coordinates[cols[2]] = (float(line[31:38]), float(line[39:46]), float(line[47:54]))
if ( cols[4] == ChainID ) or ( ChainID == ''):
if cur_res != int(cols[5]):
if cur_res != -50000:
all_coordinates[cur_res] = cur_coordinates
cur_res = int(cols[5])
cur_coordinates = {}
cur_coordinates['type'] = cols[3]
cur_coordinates['chain'] = cols[4]
cur_coordinates['active'] = 1 #keep track of whether we count this residue
cur_coordinates[cols[2]] = (float(cols[6]), float(cols[7]), float(cols[8]))
# elif (not ca_only) and line[13:14] != 'H':
# cur_coordinates[cols[2]] = (float(line[31:38]), float(line[39:46]), float(line[47:54]))

return all_coordinates

Expand All @@ -111,18 +116,20 @@ class SequenceProfile:

def __init__(self, res_input):
self.wt_pos = []
self.num_sequences = 0
self.res_id = []
self.num_sequences = 0
self.mutations = {}

for res in res_input:
self.wt_pos.append( AA3LetTo1Let(res_input[res]['type']) )
self.res_id.append( res )


def add_struct( self, res_coords ):

if len( res_coords ) != len( self.wt_pos ):
print "Error: input structure and wt structure doesn't have the same size"
print "res coords:", len (res_coords), "wt_pos", len (self.wt_pos)
print("Error: input structure and wt structure doesn't have the same size")
print("res coords:", len(res_coords), "wt_pos", len(self.wt_pos))
sys.exit()
i = -1
self.num_sequences = self.num_sequences + 1
Expand All @@ -144,9 +151,14 @@ def add_struct( self, res_coords ):
def get_outstring( self ):

outstring = ""
for i in range( len( self.wt_pos ) ):
firstMut = 0
for i in range( len( self.wt_pos ) ):
if self.mutations.has_key( i ):
outstring = outstring + self.wt_pos[i] + str(i+1) + ": "
if firstMut == 0:
outstring = "RosettaRes (pdbRes): % Mut\n"
firstMut = firstMut + 1

outstring = outstring + self.wt_pos[i] + str(i+1) + " (" + self.wt_pos[i] + str(self.res_id[i]) + "): "
for res in self.mutations[ i ]:

freq = float(self.mutations[ i ][res]) / float( self.num_sequences )
Expand All @@ -168,6 +180,7 @@ def get_outstring( self ):
outfile = ""
Singlefile = ''
listmode = 0
ChainID = ''

CommandArgs = sys.argv[1:]

Expand All @@ -181,34 +194,37 @@ def get_outstring( self ):
outfile = CommandArgs[CommandArgs.index(arg)+1]
elif arg == '-s':
Singlefile = CommandArgs[CommandArgs.index(arg)+1]
elif arg == '-c':
ChainID = CommandArgs[CommandArgs.index(arg)+1]



if ( (not Listfile and not Singlefile) or (not template) ):
print """
Usage: python ~/install/perl/tools/SequenceProfile.py -l list.txt -t original_structure.pdb
print("""
Usage: python ~/install/perl/tools/SequenceProfile.py -l list.txt -t original_structure.pdb -c chainID

list.txt is a list of pdb files to be compared (e.g. ls *_DE_1.pdb > list.txt)
original_structure.pdb is the sequence against which all others are compared, often the original pdb, or the input to design
chainID is the chain of interest for the profile

Note that the script only takes a monomer, it can't handle multiple chains. In those cases you might consider pulling out just the designed chain from all of your structures and running the script on those chains alone.
"""
Note that the script cannot profile multiple chains. In those cases you will want to rerun the script multiple times, changing the chainID for each designed chain.
""")
sys.exit()
elif(listmode):
inlist = open(Listfile,'r')
FileList = inlist.readlines()
inlist.close()
if outfile == ' ':
outfile = Listfile + '.ana'
print "Checking structures for %s structures in %s to template %s" % (len(FileList), Listfile, template)
print("Checking structures for %s structures in %s to template %s" % (len(FileList), Listfile, template))

else:
FileList.append(Singlefile)

template_coords = {}

if template != '':
template_coords = get_coordinates(template)
template_coords = get_coordinates(template,ChainID)

seq_prof = SequenceProfile( template_coords )

Expand All @@ -218,7 +234,7 @@ def get_outstring( self ):
for struct in FileList:

if len(struct)>1: # make sure this is not an empty line
struct_coords = get_coordinates(struct)
struct_coords = get_coordinates(struct,ChainID)

seq_prof.add_struct( struct_coords )

Expand All @@ -229,7 +245,7 @@ def get_outstring( self ):


if outfile == "":
print outstring
print(outstring)
#print pymutstring

else:
Expand Down