45 lines
2.2 KiB
Python
45 lines
2.2 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
""" management command for importing sentences from the Tatoeba's sentences CSV file to the database engine."""
|
||
|
import csv
|
||
|
# If Django runs in debug mode, this comes in handy: it clears the query log Django keeps in that mode.
|
||
|
from django import db
|
||
|
db.reset_queries()
|
||
|
|
||
|
from itertools import islice
|
||
|
from django.core.management.base import BaseCommand, CommandError
|
||
|
from sentence_finder.models import phrase
|
||
|
|
||
|
class Command(BaseCommand):
    """Re-import sentences for the requested languages into the database.

    The command reads the tab-separated file named by ``SENTENCES_FILE``
    (relative to the current working directory), deletes every existing
    ``phrase`` row and bulk-inserts one ``phrase`` per CSV row whose
    language code was requested on the command line.
    """

    help = (
        'Re-imports phrases from the Tatoeba database. It is needed to have '
        'the file sentences.csv in the root of Django (where the file '
        'manage.py is located)'
    )

    # Path of the CSV file to import, relative to the working directory.
    SENTENCES_FILE = "sentences.csv"

    def add_arguments(self, parser):
        """Register the positional ``languages`` argument on *parser*."""
        parser.add_argument(
            'languages',
            type=str,
            help='Comma separated languages (ISO 639-3) to import to db (example: spa,rus,eng)',
        )

    def get_data(self, languages):
        """Lazily yield unsaved ``phrase`` objects read from the CSV file.

        The file is streamed row by row, so the whole file is never held in
        memory. ``row[1]`` holds the language code (ISO 639-3) and ``row[2]``
        the sentence text.

        :param languages: container of language codes to keep.
        :returns: generator of ``phrase`` instances (not yet saved).
        """
        # NOTE(review): the file is opened with the platform default encoding
        # and csv's default quote handling - confirm both match the export
        # format (an unquoted export would call for quoting=csv.QUOTE_NONE).
        with open(self.SENTENCES_FILE, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter="\t"):
                # Blank or truncated lines have no language/text columns.
                if len(row) < 3:
                    continue
                if row[1] in languages:
                    yield phrase(phrase=row[2], language=row[1])

    def bulk_create_iter(self, iterable, batch_size=10000):
        """Bulk create objects from any iterable, *batch_size* at a time.

        :param iterable: iterable (generator, list, ...) of ``phrase`` objects.
        :param batch_size: maximum number of objects inserted per query batch.
        :returns: the number of objects created.
        """
        # Take a single iterator up front: slicing a re-iterable (e.g. a
        # list) directly would restart from its first element on every pass
        # and never terminate.
        pending = iter(iterable)
        created = 0
        while True:
            batch = list(islice(pending, batch_size))
            if not batch:
                break
            phrase.objects.bulk_create(batch)
            created += len(batch)
            # In debug mode Django records every executed query; drop that
            # log after each batch so it does not grow with the import.
            db.reset_queries()
        return created

    def handle(self, *args, **options):
        """Command processor. Receive all arguments and call the appropriate helper functions.

        :raises CommandError: if no language code was given or the CSV file
            cannot be opened. Both checks run before any data is deleted.
        """
        # Tolerate spaces and stray commas, e.g. "spa, rus,".
        languages = [
            code.strip()
            for code in options["languages"].split(",")
            if code.strip()
        ]
        if not languages:
            raise CommandError("No languages given; nothing to import.")

        # The import reads the file lazily, so verify it is readable now,
        # before the existing rows are removed.
        try:
            with open(self.SENTENCES_FILE, "r"):
                pass
        except IOError as error:
            raise CommandError(
                "Cannot read {name}: {error}".format(
                    name=self.SENTENCES_FILE, error=error
                )
            )

        self.stdout.write("Extracting phrases for the following languages: {langs}".format(langs=languages))
        self.stdout.write("Removing data from database...")
        # All previous data must be deleted first: there is no
        # insert-or-update when adding rows in bulk.
        phrase.objects.all().delete()
        self.stdout.write("Importing new data...")
        data = self.get_data(languages)
        self.stdout.write(str(self.bulk_create_iter(data, 10000)))
|