Skip to content

Biostumblematic

A biophysicist teaches himself how to code

I’ve written about ElementTree before, and it really is a handy tool. I took the NCBI GI numbers produced in my previous post and fed them to the ID mapper at UniProt to get a listing of the corresponding proteins. UniProt kindly allows you to download this subset as XML, which I did in order to quickly extract the information I was interested in.

Here’s what a bit of the XML looks like:

<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
<entry dataset="TrEMBL" created="2000-05-01" modified="2009-03-03" version="30">
<accession>Q9UFQ0</accession>
<name>Q9UFQ0_HUMAN</name>
<protein>
<submittedName>
<fullName>Putative uncharacterized protein DKFZp434K0435</fullName>
</submittedName>
</protein>
<gene>
<name type="primary">DKFZp434K0435</name>
</gene>
<organism key="1">
<name type="scientific">Homo sapiens</name>
<name type="common">Human</name>
<dbReference type="NCBI Taxonomy" id="9606" key="2" />
<lineage>
<taxon>Eukaryota</taxon>
<taxon>Metazoa</taxon>
<taxon>Chordata</taxon>
<taxon>Craniata</taxon>
<taxon>Vertebrata</taxon>
<taxon>Euteleostomi</taxon>
<taxon>Mammalia</taxon>
<taxon>Eutheria</taxon>
<taxon>Euarchontoglires</taxon>
<taxon>Primates</taxon>
<taxon>Haplorrhini</taxon>
<taxon>Catarrhini</taxon>
<taxon>Hominidae</taxon>
<taxon>Homo</taxon>
</lineage>
</organism>
<reference key="3">
<citation type="submission" date="1999-09" db="EMBL/GenBank/DDBJ databases">
<authorList>
<person name="Poustka A." />
<person name="Klein M." />
<person name="Mewes H.W." />
<person name="Gassenhuber J." />
<person name="Wiemann S." />
</authorList>
</citation>
<scope>NUCLEOTIDE SEQUENCE</scope>
<source>
<tissue>Testis</tissue>
</source>
</reference>
<dbReference type="EMBL" id="AL117514" key="4">
<property type="protein sequence ID" value="CAB55973.1" />
<property type="molecule type" value="mRNA" />
</dbReference>
<dbReference type="IPI" id="IPI00798127" key="5" />
<dbReference type="UniGene" id="Hs.520348" key="6" />
<dbReference type="HSSP" id="Q862M4" key="7">
<property type="PDB accession" value="1AAR" />
</dbReference>
<dbReference type="SMR" id="Q9UFQ0" key="8">
<property type="residue range" value="11-86" />
</dbReference>
<dbReference type="IntAct" id="Q9UFQ0" key="9">
<property type="interactions" value="1" />
</dbReference>
<dbReference type="Ensembl" id="ENSG00000150991" key="10">
<property type="organism name" value="Homo sapiens" />
</dbReference>
<dbReference type="HGNC" id="HGNC:12468" key="11">
<property type="gene designation" value="UBC" />
</dbReference>
<dbReference type="HOVERGEN" id="Q9UFQ0" key="12" />
<dbReference type="ArrayExpress" id="Q9UFQ0" key="13" />
<dbReference type="Bgee" id="Q9UFQ0" key="14" />
<dbReference type="GO" id="GO:0006464" key="15">
<property type="term" value="P:protein modification process" />
<property type="evidence" value="IEA:InterPro" />
</dbReference>
<dbReference type="InterPro" id="IPR000626" key="16">
<property type="entry name" value="Ubiquitin" />
</dbReference>
<dbReference type="Pfam" id="PF00240" key="17">
<property type="entry name" value="ubiquitin" />
<property type="match status" value="3" />
</dbReference>
<dbReference type="PRINTS" id="PR00348" key="18">
<property type="entry name" value="UBIQUITIN" />
</dbReference>
<dbReference type="SMART" id="SM00213" key="19">
<property type="entry name" value="UBQ" />
<property type="match status" value="3" />
</dbReference>
<dbReference type="PROSITE" id="PS00299" key="20">
<property type="entry name" value="UBIQUITIN_1" />
<property type="match status" value="3" />
</dbReference>
<dbReference type="PROSITE" id="PS50053" key="21">
<property type="entry name" value="UBIQUITIN_2" />
<property type="match status" value="3" />
</dbReference>
<proteinExistence type="Evidence at transcript level" />
<feature type="non-terminal residue">
<location>
<position position="1" />
</location>
</feature>
<sequence length="239" mass="26873" checksum="88AE150EF8A58366" modified="2000-05-01" version="1" fragment="single">
LHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQL
EDGRTLSDYNIQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKE
GIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVE
PSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGV
</sequence>
</entry>
<copyright>
Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms
Distributed under the Creative Commons Attribution-NoDerivs License
</copyright>
</uniprot>

This is only one protein from a very long list. There is a lot of information here, but thankfully we can slice it down quickly using ElementTree. Here’s the code I worked up:

#! /usr/bin/env python
###################
# Read data from a Uniprot XML dump
###################

import sys, re, textwrap
import xml.etree.ElementTree as ET

# XML work
# Namespace prefix in the '{uri}' form ElementTree uses for qualified tag
# names; it is prepended to every tag passed to find()/findall() below.
ns = '{http://uniprot.org/uniprot}'

# Parse the UniProt XML dump once at start-up. The input path is hard-coded,
# so the script expects 'infile.xml' in the current working directory.
tree = ET.parse('infile.xml')

def GetName(self):
    '''Get the full name of the current protein entry.

    The argument is accepted for call compatibility but is not used: the
    entry is read from the module-level name ``protein`` and tags are
    qualified with the module-level ``ns`` prefix.

    Returns the text of <protein>/<recommendedName>/<fullName>, falling
    back to <protein>/<submittedName>/<fullName> when the entry has no
    recommended name.

    Raises AttributeError when neither name element is present.
    '''
    protein_node = protein.find(ns + 'protein')
    try:
        return protein_node.find(ns + 'recommendedName').findtext(ns + 'fullName')
    except AttributeError:
        # find() returned None (no <recommendedName>). Only that case is
        # handled; a bare except would also hide unrelated failures such as
        # KeyboardInterrupt.
        return protein_node.find(ns + 'submittedName').findtext(ns + 'fullName')

def GetGeneName(self):
    '''Get the gene name of the current protein entry.

    The argument is accepted for call compatibility but is not used: the
    entry is read from the module-level name ``protein`` and tags are
    qualified with the module-level ``ns`` prefix.

    Returns the text of the first <gene>/<name> element. When the entry has
    no <gene> element, a short name is invented from the first 13
    characters of the protein's full name (recommended name first, then
    submitted name).

    Raises AttributeError when there is no <gene> element and no protein
    name element either.
    '''
    try:
        return protein.find(ns + 'gene').findtext(ns + 'name')
    except AttributeError:
        # No <gene> element: fall through and invent a short name.
        pass

    # Have to invent short names for proteins that don't have one
    protein_node = protein.find(ns + 'protein')
    try:
        fullname = protein_node.find(ns + 'recommendedName').findtext(ns + 'fullName')
    except AttributeError:
        # No <recommendedName>; use the submitted name instead.
        fullname = protein_node.find(ns + 'submittedName').findtext(ns + 'fullName')
    return fullname[0:13]
    
def GetAccessions(self):
    '''Collect the text of every <accession> child of the current entry.

    The argument is not used; the entry is read from the module-level name
    ``protein``. Returns a list of accession codes in document order (empty
    when the entry has none).
    '''
    return [node.text for node in protein.findall(ns + 'accession')]

def GetSequence(self):
    '''Retrieve the protein sequence of the current entry.

    The argument is not used; the entry is read from the module-level name
    ``protein``. All whitespace (including the line breaks present in the
    XML) is stripped from the <sequence> text, and the result is re-wrapped
    to lines of at most 60 characters.

    Returns the wrapped sequence as a single newline-joined string.
    '''
    sequence = protein.findtext(ns + 'sequence')
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    seq_clean = re.sub(r"\s+", "", sequence)
    return textwrap.fill(seq_clean, 60)
  
def GetDomain(self):
    '''Return domains with start and stop positions.

    The argument is not used; the entry is read from the module-level name
    ``protein``. Every <feature> child whose ``type`` attribute is
    'domain' contributes one item to the result.

    Returns a dict mapping each domain's ``description`` attribute to a
    'start-stop' string built from the ``position`` attributes of the
    feature's <location>/<begin> and <location>/<end> elements. Domains
    sharing a description overwrite one another (the last one wins).
    '''
    domain_dict = {}
    for feature in protein.findall(ns + 'feature'):
        if feature.get('type') != 'domain':
            continue
        # Each domain feature is handled exactly once; there is no need to
        # revisit the previously seen domains on every iteration.
        location = feature.find(ns + 'location')
        domain_start = location.find(ns + 'begin').get('position')
        domain_stop = location.find(ns + 'end').get('position')
        domain_dict[feature.get('description')] = domain_start + '-' + domain_stop
    return domain_dict

# Let's do it
# Walk every <uniprot> element in the parsed tree and print a short report
# (name, gene name, accessions, sequence, domains) for each <entry> in it.
# iter() replaces getiterator(), which is deprecated and no longer exists in
# Python 3.9+; it requires Python 2.7 or newer.
for item in tree.iter(ns + 'uniprot'):
    proteins = item.findall(ns + 'entry')
    # The loop variable must be named `protein`: the Get* functions read the
    # module-level name `protein` and ignore the argument they are given.
    for protein in proteins:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print(GetName(item))
        print(GetGeneName(item))
        print(GetAccessions(item))
        print(GetSequence(item))
        print(GetDomain(item))
        print('------------\n')

Hopefully this is at least a little clear. I read in the tree with

tree = ET.parse('infile.xml')

and then farther down get the individual entries using

for item in tree.getiterator(ns+'uniprot'):
    proteins = item.findall(ns+'entry')

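. The program then steps through the various functions defined above to pull out the desired data. Note that each function reads the current entry from the loop variable `protein` rather than from the argument it is passed. These functions should be fairly simple to dissect.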

I just run this from the command line with

./uniprot_parse.py > outfile.txt

, and for this protein the results look like the following:

Putative uncharacterized protein DKFZp434K0435
DKFZp434K0435
['Q9UFQ0']
LHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQL
EDGRTLSDYNIQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKE
GIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGMQIFVKTLTGKTITLEVE
PSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGGV
{}
------------
Advertisements

%d bloggers like this: