| |
| import json |
| from fastapi import APIRouter, Response |
| from fastapi.responses import JSONResponse |
| from pythainlp.tokenize import ( |
| word_tokenize as py_word_tokenize, |
| subword_tokenize as py_subword_tokenize, |
| sent_tokenize as py_sent_tokenize |
| ) |
| from enum import Enum |
| from typing import List, Optional |
| from pydantic import BaseModel |
|
|
# Router collecting the Thai tokenization endpoints; mounted onto the main app elsewhere.
router = APIRouter()
|
|
|
|
class SentTokenizeEngine(str, Enum):
    """Engine names accepted by the /sent_tokenize endpoint.

    Passed through to ``pythainlp.tokenize.sent_tokenize``.

    NOTE(review): this name is re-bound later in this module by a response
    model class of the same name, which shadows this enum — confirm and
    rename one of the two definitions.
    """

    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"
|
|
|
|
class WordTokenizeEngine(str, Enum):
    """Engine names accepted by the /word_tokenize endpoint.

    Passed through to ``pythainlp.tokenize.word_tokenize``.
    """

    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"
|
|
|
|
class SubwordTokenizeEngine(str, Enum):
    """Engine names accepted by the /subword_tokenize endpoint.

    Passed through to ``pythainlp.tokenize.subword_tokenize``.
    """

    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"
|
|
class WordTokenizeResponse(BaseModel):
    """Response schema for /word_tokenize."""

    # Tokenized words, in input order. (Pydantic deep-copies mutable
    # defaults per instance, so `[]` is safe here.)
    words: List[str] = []
|
|
class SubwordTokenizeResponse(BaseModel):
    """Response schema for /subword_tokenize."""

    # Tokenized subword units, in input order.
    subwords: List[str] = []
|
|
class SentTokenizeResponse(BaseModel):
    """Response schema for /sent_tokenize.

    Renamed from ``SentTokenizeEngine``: the previous name collided with —
    and shadowed — the engine enum of the same name defined earlier in this
    module, which made the ``engine`` parameter of /sent_tokenize resolve to
    this model instead of the enum. The new name also matches the sibling
    ``WordTokenizeResponse`` / ``SubwordTokenizeResponse`` convention.
    """

    # Segmented sentences, in input order.
    sents: List[str] = []
|
|
@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenize or word segmentation for Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Word tokenize engine (default is newmm)
    """
    # Pass the enum's plain string value to pythainlp, and build the
    # JSONResponse manually so the Thai payload is served with an explicit
    # UTF-8 charset instead of FastAPI's default response serialization.
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine.value)},
        media_type="application/json; charset=utf-8",
    )
|
|
|
|
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenize or subword segmentation for Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Subword tokenize engine (default is tcc)
    """
    # Pass the enum's plain string value to pythainlp, and build the
    # JSONResponse manually so the Thai payload is served with an explicit
    # UTF-8 charset instead of FastAPI's default response serialization.
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine.value)},
        media_type="application/json; charset=utf-8",
    )
|
|
|
|
# NOTE(review): ``SentTokenizeEngine`` below resolves to the *response model*
# class defined later in the module, which re-uses (and shadows) the engine
# enum's name. As written, ``engine`` is therefore typed as a pydantic model
# rather than the enum of engine names, and ``response_model`` points at that
# same shadowing class — one of the two definitions should be renamed.
@router.post('/sent_tokenize', response_model=SentTokenizeEngine)
def sent_tokenize(text: str, engine: SentTokenizeEngine = "crfcut"):
    """
    Thai sentence segmentation

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Sentence tokenize engine (default is crfcut)
    """
    # Build the JSONResponse manually so the Thai payload is served with an
    # explicit UTF-8 charset instead of FastAPI's default serialization.
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
|
|