#
# title: uniprot_fetch_isoforms.py
# author: Nick Fitzkee (nfitzkee@chemistry.msstate.edu)
# date: January 5, 2023
# summary: Fetch a list of sequences given UniProtKB IDs, along
#          with isoforms
#
# To use
#   $> python3 ./uniprot_fetch_isoforms.py uniprot_ids.txt | tee uniprot.fasta
# the above command will make the file "uniprot.fasta"
#
# Only FASTA records are written to stdout; warnings and the usage message
# go to stderr so they never end up inside the FASTA file.
#

import code  # only needed for the optional code.interact() debugging hook
import os
import re
import sys
import urllib.request

_UNIPROT_BASE = 'https://rest.uniprot.org/uniprotkb'

# Marker (upper-cased) of the comment line that announces alternative splicing.
_ALT_SPLICE_TAG = 'EVENT=ALTERNATIVE SPLICING;'

# Isoform count on that line; matched against the upper-cased line text.
_ISOFORM_COUNT = re.compile(r'ISOFORMS=(\d+)')


def open_list(fname):
    """Read UniProtKB IDs from a text file, one ID per line.

    Leading/trailing whitespace is stripped from every line.  Blank lines
    and lines whose first non-blank character is '#' are skipped.

    Args:
        fname: Path of the ID list file.

    Returns:
        The remaining lines as a list of strings, in file order.
    """
    result = []
    with open(fname) as uplist:
        for line in uplist:
            line = line.strip()
            if line and not line.startswith('#'):
                result.append(line)
    return result


def _fetch_text(url):
    """Download `url` and return the response body decoded as UTF-8."""
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')


def _count_isoforms(entry_text):
    """Return the isoform count announced in a UniProtKB text entry.

    The count is taken from the single line containing
    'Event=Alternative splicing;' (case-insensitive).  Zero is returned when
    no such line exists, or when the count cannot be determined
    unambiguously; in the ambiguous case a warning is written to stderr.
    """
    upper_lines = (line.upper() for line in entry_text.split('\n'))
    alt_splice_lines = [line for line in upper_lines
                        if _ALT_SPLICE_TAG in line]

    nums = []
    n_isoforms = 0
    if len(alt_splice_lines) == 1:
        nums = _ISOFORM_COUNT.findall(alt_splice_lines[0])
        if len(nums) == 1:
            n_isoforms = int(nums[0])

    if len(alt_splice_lines) > 1 or len(nums) > 1:
        # stderr, not stdout: stdout is expected to be redirected to a
        # FASTA file and must contain sequence records only.
        print('Warning: Multiple isoform entries returned!', file=sys.stderr)

    return n_isoforms


def main(fname):
    """Print the FASTA record, plus all isoform records, for each listed ID.

    For every ID read from `fname` the main FASTA entry is printed first.
    The text-format entry is then inspected for an alternative-splicing
    annotation, and '<ID>-1' .. '<ID>-N' are fetched and printed for the N
    isoforms it announces.

    Args:
        fname: Path of a file accepted by open_list().

    Raises:
        urllib.error.URLError: If any request fails (not caught here, so
            one failing ID aborts the run).
    """
    for up in open_list(fname):
        # Get the main entry fasta file
        print(_fetch_text('%s/%s.fasta' % (_UNIPROT_BASE, up)))

        # Check the entry for an "Alternative Splicing" notation
        n_isoforms = _count_isoforms(
            _fetch_text('%s/%s.txt' % (_UNIPROT_BASE, up)))

        #code.interact(local=locals())

        # Isoform accessions are numbered from 1.
        for i in range(1, n_isoforms + 1):
            print(_fetch_text('%s/%s-%i.fasta' % (_UNIPROT_BASE, up, i)))


if __name__ == '__main__':
    try:
        fn = sys.argv[1]
    except IndexError:
        print('usage: %s <uniprot_id_list>' % os.path.basename(sys.argv[0]),
              file=sys.stderr)
        sys.exit(1)

    main(fn)