Source code for pepper.language.ner

from pepper import logger

from contextlib import contextmanager, closing
from threading import Thread

import socket
import subprocess
import os

from time import sleep


class NER(object):
    """Client for a locally spawned Stanford NER server.

    On construction a free TCP port is picked and a background daemon thread
    launches ``java ... edu.stanford.nlp.ie.NERServer`` on that port, piping
    the server's output into the debug log. ``tag`` opens one TCP connection
    per request. The object can be used as a context manager; leaving the
    ``with`` block calls ``close``.
    """

    # Directory expected to contain 'stanford-ner.jar' and the classifier files.
    ROOT = os.path.join(os.path.dirname(__file__), 'stanford-ner')
    # Host the client connects to.
    IP = 'localhost'

    def __init__(self, classifier='english.all.3class.distsim.crf.ser'):
        """Start the NER server in the background.

        Parameters
        ----------
        classifier: str
            File name of the classifier, resolved relative to ``NER.ROOT``.
        """
        self._log = logger.getChild(self.__class__.__name__)

        self._port = self._find_free_port()

        # Set by the server thread once the Java process has been spawned.
        self._ner_server_process = None
        self._ner_server_thread = Thread(target=self._start_server, args=(classifier,))
        self._ner_server_thread.daemon = True
        self._ner_server_thread.start()

        self._log.debug("Booted: ({}:{})".format(self.IP, self._port))

    def tag(self, text):
        """Send ``text`` to the NER server and parse its response.

        Parameters
        ----------
        text: str
            Text to tag; surrounding whitespace is stripped before sending.

        Returns
        -------
        list of tuple
            One ``(token, tag)`` pair per space-separated item of the
            response that contains a '/', split on its last '/'.

        Raises
        ------
        RuntimeError
            If the server is not running (see ``_connect``).
        """
        with self._connect() as connection:
            # sendall: a plain send() may transmit only part of the payload.
            connection.sendall((text.strip() + '\n').encode('utf-8'))
            response = self._recv_all(connection)

        items = response.replace('\n', '').strip().split(' ')
        pairs = (item.rsplit('/', 1) for item in items)
        return [tuple(pair) for pair in pairs if len(pair) == 2]

    def close(self):
        """Kill the NER server process, if it has been started.

        Safe to call when the process was never spawned or has already exited.
        """
        process = self._ner_server_process
        if process is None:
            # Server thread has not spawned the process (yet), nothing to kill.
            return

        if process.poll() is None:
            try:
                process.kill()
            except OSError:
                # Process exited between poll() and kill().
                pass
            # Reap the killed process so it does not linger as a zombie.
            process.wait()

    def _start_server(self, classifier):
        """Spawn the Java NER server and forward its output to the log.

        Runs in the background thread; returns when the server's output
        pipe reaches end-of-file.
        """
        self._ner_server_process = subprocess.Popen([
            'java', '-cp', os.path.join(NER.ROOT, 'stanford-ner.jar'),
            'edu.stanford.nlp.ie.NERServer',
            '-port', str(self._port),
            '-loadClassifier', os.path.join(NER.ROOT, classifier)],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        with self._ner_server_process.stdout:
            self._log_subprocess_output(self._ner_server_process.stdout)

    def _log_subprocess_output(self, pipe):
        """Log every line read from the binary ``pipe`` at debug level."""
        for line in iter(pipe.readline, b''):
            # The pipe yields bytes; decode so the newline can be removed and
            # the message is logged as text on Python 3 as well.
            if isinstance(line, bytes):
                line = line.decode('utf-8', 'replace')
            self._log.debug(line.replace('\n', ''))

    def _find_free_port(self):
        """Return a port number that was free at the time of the call.

        The probing socket is closed before returning, so another process
        could in principle claim the port before the server binds to it.
        """
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            return s.getsockname()[1]

    @contextmanager
    def _connect(self):
        """Yield a socket connected to the NER server, retrying until it is up.

        Connection attempts are repeated every 0.1 seconds. The socket is
        shut down and closed when the context exits.

        Raises
        ------
        RuntimeError
            If a connection attempt fails and the server thread is no longer
            running (server failed to start, exited, or was closed).
        """
        sock = None
        try:
            while True:
                # Use a fresh socket per attempt: the state of a socket after
                # a failed connect is unspecified, so it is not reused.
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                if sock.connect_ex((self.IP, self._port)) == 0:
                    break
                sock.close()
                sock = None

                # The server thread only ends once the server's output pipe
                # is exhausted or spawning failed; retrying would never succeed.
                if not self._ner_server_thread.is_alive():
                    raise RuntimeError(
                        "NER server is not running (failed to start or was closed)")
                sleep(0.1)
            yield sock
        finally:
            if sock is not None:
                try:
                    sock.shutdown(socket.SHUT_RDWR)
                except socket.error:
                    # Peer may already have closed the connection.
                    pass
                finally:
                    sock.close()

    def _recv_all(self, socket):
        """Read from ``socket`` until the peer closes and return UTF-8 text.

        Note: the parameter name shadows the ``socket`` module inside this
        method only; it is kept for signature compatibility.
        """
        buffer = bytearray()
        while True:
            data = socket.recv(4096)
            if not data:
                break
            buffer.extend(data)
        return buffer.decode('utf-8')

    def __enter__(self):
        """Return this instance for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the server when leaving the ``with`` block."""
        self.close()