Skip to content


A biophysicist teaches himself how to code

I’ve written about ElementTree before, and it really is a handy tool. I took the output NCBI GI numbers from my previous post and used them in concert with the ID mapper at UniProt to get a listing of the proteins. UniProt kindly allows you to download this subset in XML, which I did in order to quickly extract the information I was interested in.

Here’s what a bit of the XML looks like:

<?xml version='1.0' encoding='UTF-8'?>
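<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">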
<entry dataset="TrEMBL" created="2000-05-01" modified="2009-03-03" version="30">
<fullName>Putative uncharacterized protein DKFZp434K0435</fullName>
<name type="primary">DKFZp434K0435</name>
<organism key="1">
<name type="scientific">Homo sapiens</name>
<name type="common">Human</name>
<dbReference type="NCBI Taxonomy" id="9606" key="2" />
<reference key="3">
<citation type="submission" date="1999-09" db="EMBL/GenBank/DDBJ databases">
<person name="Poustka A." />
<person name="Klein M." />
<person name="Mewes H.W." />
<person name="Gassenhuber J." />
<person name="Wiemann S." />
<dbReference type="EMBL" id="AL117514" key="4">
<property type="protein sequence ID" value="CAB55973.1" />
<property type="molecule type" value="mRNA" />
<dbReference type="IPI" id="IPI00798127" key="5" />
<dbReference type="UniGene" id="Hs.520348" key="6" />
<dbReference type="HSSP" id="Q862M4" key="7">
<property type="PDB accession" value="1AAR" />
<dbReference type="SMR" id="Q9UFQ0" key="8">
<property type="residue range" value="11-86" />
<dbReference type="IntAct" id="Q9UFQ0" key="9">
<property type="interactions" value="1" />
<dbReference type="Ensembl" id="ENSG00000150991" key="10">
<property type="organism name" value="Homo sapiens" />
<dbReference type="HGNC" id="HGNC:12468" key="11">
<property type="gene designation" value="UBC" />
<dbReference type="HOVERGEN" id="Q9UFQ0" key="12" />
<dbReference type="ArrayExpress" id="Q9UFQ0" key="13" />
<dbReference type="Bgee" id="Q9UFQ0" key="14" />
<dbReference type="GO" id="GO:0006464" key="15">
<property type="term" value="P:protein modification process" />
<property type="evidence" value="IEA:InterPro" />
<dbReference type="InterPro" id="IPR000626" key="16">
<property type="entry name" value="Ubiquitin" />
<dbReference type="Pfam" id="PF00240" key="17">
<property type="entry name" value="ubiquitin" />
<property type="match status" value="3" />
<dbReference type="PRINTS" id="PR00348" key="18">
<property type="entry name" value="UBIQUITIN" />
<dbReference type="SMART" id="SM00213" key="19">
<property type="entry name" value="UBQ" />
<property type="match status" value="3" />
<dbReference type="PROSITE" id="PS00299" key="20">
<property type="entry name" value="UBIQUITIN_1" />
<property type="match status" value="3" />
<dbReference type="PROSITE" id="PS50053" key="21">
<property type="entry name" value="UBIQUITIN_2" />
<property type="match status" value="3" />
<proteinExistence type="Evidence at transcript level" />
<feature type="non-terminal residue">
<position position="1" />
<sequence length="239" mass="26873" checksum="88AE150EF8A58366" modified="2000-05-01" version="1" fragment="single">
Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms
Distributed under the Creative Commons Attribution-NoDerivs License

This is only one protein from a very long list. There is a lot of information here, but thankfully we can slice it down quickly using ElementTree. Here’s the code I worked up:

#! /usr/bin/env python
# Read data from a Uniprot XML dump

import sys, re, textwrap
import xml.etree.ElementTree as ET

# XML work
# The tags in a UniProt XML dump are declared in the UniProt namespace (see
# the xmlns attribute on the <uniprot> root), so ElementTree reports each
# tag as '{namespace-uri}tag'.  This prefix is prepended to every tag name
# looked up below; with an empty '{}' prefix none of the lookups match.
ns = '{http://uniprot.org/uniprot}'

# Parse the whole dump up front; the input file name is hard-coded.
tree = ET.parse('infile.xml')

def GetName(self):
    '''Get the name of the protein.

    Looks inside the entry's <protein> element and prefers the
    <recommendedName>; when that is absent (or has no <fullName> text)
    the <submittedName> is used instead.

    NOTE: the argument is not used.  The lookup runs against the
    module-level `protein` element that the driver loop rebinds for each
    entry, with tag names qualified by the module-level `ns` prefix.

    Returns the <fullName> text, or None when the entry carries no
    usable name.
    '''
    protein_node = protein.find(ns + 'protein')
    if protein_node is None:
        return None
    # Explicit None checks instead of letting a failed find() blow up with
    # AttributeError on the chained call.
    for tag in ('recommendedName', 'submittedName'):
        name_node = protein_node.find(ns + tag)
        if name_node is None:
            continue
        name = name_node.findtext(ns + 'fullName')
        if name:
            return name
    return None

def GetGeneName(self):
    '''Get the gene name.

    Returns the text of the first <name> under the entry's <gene>
    element.  When the entry has no gene name, a short stand-in is
    invented from the first 13 characters of the protein's full name
    (recommended name first, then submitted name).

    NOTE: the argument is not used.  The lookup runs against the
    module-level `protein` element that the driver loop rebinds for each
    entry, with tag names qualified by the module-level `ns` prefix.

    Returns None when neither a gene name nor a protein name exists.
    '''
    gene_node = protein.find(ns + 'gene')
    if gene_node is not None:
        gene_name = gene_node.findtext(ns + 'name')
        if gene_name:
            return gene_name

    # Have to invent short names for proteins that don't have one
    protein_node = protein.find(ns + 'protein')
    if protein_node is None:
        return None
    for tag in ('recommendedName', 'submittedName'):
        name_node = protein_node.find(ns + tag)
        if name_node is None:
            continue
        fullname = name_node.findtext(ns + 'fullName')
        if fullname:
            return fullname[0:13]
    return None
def GetAccessions(self):
    '''Get all accession codes for this protein.

    NOTE: the argument is not used.  The lookup runs against the
    module-level `protein` element that the driver loop rebinds for each
    entry, with tag names qualified by the module-level `ns` prefix.

    Returns a list with the text of every <accession> child of the
    entry, in document order (empty when there are none).
    '''
    return [accession.text for accession in protein.findall(ns + 'accession')]

def GetSequence(self):
    '''Retrieve the protein sequence.

    NOTE: the argument is not used.  The lookup runs against the
    module-level `protein` element that the driver loop rebinds for each
    entry, with tag names qualified by the module-level `ns` prefix.

    Returns the text of the entry's <sequence> element with all
    whitespace removed and re-wrapped at 60 characters per line, or an
    empty string when the entry has no sequence.
    '''
    # A default of '' keeps re.sub from raising TypeError on a missing
    # <sequence> element.
    sequence = protein.findtext(ns + 'sequence', '')
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    seq_clean = re.sub(r'\s+', '', sequence)
    return textwrap.fill(seq_clean, 60)
def GetDomain(self):
    '''Return domains with start and stop positions.

    Scans the entry's <feature> elements and, for each one whose type
    attribute is 'domain', reads the position attributes of
    <location>/<begin> and <location>/<end>.

    NOTE: the argument is not used.  The lookup runs against the
    module-level `protein` element that the driver loop rebinds for each
    entry, with tag names qualified by the module-level `ns` prefix.

    Returns a dict mapping each domain's description attribute to a
    'start-stop' string.  Features that lack a location, a begin/end
    pair, or a position attribute are skipped.  Because the dict is
    keyed by description, domains sharing a description keep only the
    last span seen.
    '''
    domain_dict = {}
    for feature in protein.findall(ns + 'feature'):
        if feature.get('type') != 'domain':
            continue
        location = feature.find(ns + 'location')
        if location is None:
            continue
        begin = location.find(ns + 'begin')
        end = location.find(ns + 'end')
        if begin is None or end is None:
            continue
        domain_start = begin.get('position')
        domain_stop = end.get('position')
        if domain_start is None or domain_stop is None:
            continue
        domain_dict[feature.get('description')] = domain_start + '-' + domain_stop
    return domain_dict

# Let's do it
# Visit every <uniprot> element in the parsed tree and print one report per
# <entry>.  ElementTree.iter() replaces getiterator(), which was removed in
# Python 3.9.
for item in tree.iter(ns+'uniprot'):
    proteins = item.findall(ns+'entry')
    # The loop variable must stay named `protein`: the Get* helpers read it
    # as a module-level global rather than using their argument.
    for protein in proteins:
        # Single-argument print(...) produces the same output under both
        # Python 2 and Python 3.
        print(GetName(protein))
        print(GetGeneName(protein))
        print(GetAccessions(protein))
        print(GetSequence(protein))
        print(GetDomain(protein))
        print('------------\n')

Hopefully this is at least a little clear. I read in the tree with

tree = ET.parse('infile.xml')

and then farther down get the individual entries using

for item in tree.getiterator(ns+'uniprot'):
    proteins = item.findall(ns+'entry')

The program then steps through the various functions defined above to pull out the desired data. These functions should be fairly simple to dissect.

I just run the script (saved here as uniprot_xml.py; substitute whatever name you gave it) from the command line with

./uniprot_xml.py > outfile.txt

and for this protein the results look like the following:

Putative uncharacterized protein DKFZp434K0435

%d bloggers like this: