Source code for pangea.contrib.treeoflife.tests.populate_test_db
import gzip
import sys
from time import time
from os import environ
from os.path import join, dirname
from django.core.management.base import BaseCommand
from django.core.exceptions import ObjectDoesNotExist
from pangea.contrib.treeoflife.models import (
TaxonName,
TreeNode,
)
NCBI_DELIM = '\t|' # really...
NAMES_ENV_VAR = 'PANGEA_TREEOFLIFE_NCBI_NAMES'
NODES_ENV_VAR = 'PANGEA_TREEOFLIFE_NCBI_NODES'
TREEOFLIFE_DIR = dirname(dirname(__file__))
NAMES_DEF = join(TREEOFLIFE_DIR, 'ncbi_tree/names.dmp.gz')
NODES_DEF = join(TREEOFLIFE_DIR, 'ncbi_tree/nodes.dmp.gz')
[docs]def tokenize(filehandle):
for i, line in enumerate(filehandle):
line = line.decode('utf-8')
tkns = [tkn.strip() for tkn in line.strip().split(NCBI_DELIM)]
yield i, tkns
[docs]class TaxaTree:
def __init__(self):
self.rank_map = {}
self.parent_map = {}
self.nodes_created = {}
[docs] def create_node_in_db(self, taxon_id):
if taxon_id in self.nodes_created:
return
parent_id = self.parent_map[taxon_id]
if parent_id and parent_id not in self.nodes_created:
try:
self.create_node_in_db(parent_id)
except KeyError:
parent_id = '1' # if parent is missing assign the root as the parent
parent = self.nodes_created[parent_id] if parent_id else None
node = TreeNode(
taxon_id=taxon_id,
parent=parent,
rank=self.rank_map[taxon_id],
)
node.save()
self.nodes_created[taxon_id] = node
[docs] def create_all_nodes_in_db(self):
for i, taxon_id in enumerate(self.rank_map.keys()):
self.create_node_in_db(taxon_id)
[docs] def add_node(self, taxon_id, parent_id, rank):
self.rank_map[taxon_id] = rank
if parent_id == taxon_id: # NCBI has a self loop at the root
parent_id = None
self.parent_map[taxon_id] = parent_id
[docs]def add_nodes(nodes_filename):
tree = TaxaTree()
TreeNode.objects.all().delete()
with gzip.open(nodes_filename) as nodes_file:
for i, tkns in tokenize(nodes_file):
taxon_id, parent_id, rank = tkns[0], tkns[1], tkns[2]
if rank == 'no rank' and int(taxon_id) > 1000:
continue # should filter out strains and lower
if i > (2 * 1000):
break
tree.add_node(taxon_id, parent_id, rank)
tree.create_all_nodes_in_db()
return tree
[docs]def add_names(tree, names_filename):
"""Add names from names_filename to database."""
nodes_created = set(tree.nodes_created.keys())
assert '562' in nodes_created
TaxonName.objects.all().delete()
with gzip.open(names_filename) as names_file:
batch = []
for i, tkns in tokenize(names_file):
if i > (20 * 1000):
break
taxon_id = tkns[0]
if taxon_id not in nodes_created:
continue
batch.append(TaxonName(taxon_id=taxon_id, name=tkns[1], name_type=tkns[3]))
if len(batch) == 1000:
TaxonName.objects.bulk_create(batch)
batch = []
TaxonName.objects.bulk_create(batch)
[docs]def populate_test_db():
try:
TaxonName.objects.get(taxon_id='1')
return
except ObjectDoesNotExist:
pass
names_filename = environ.get(NAMES_ENV_VAR, NAMES_DEF)
nodes_filename = environ.get(NODES_ENV_VAR, NODES_DEF)
tree = add_nodes(nodes_filename)
add_names(tree, names_filename)