DEV Community

Vee Satayamas
Vee Satayamas

Posted on

3 3

ลอง wordcutpy บน pypy3

3 มีนาคม 2562

พอทดสอบกับไฟล์ 11MB ใช้ pypy3 ทำให้ wordcutpy เร็วขึ้นเกิน 2 เท่า! คือใช้เวลาจาก 16 วินาที เหลือไม่ถึง 8 วินาที

(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16598
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7833
(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16093
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7821
(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16272
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7810
Enter fullscreen mode Exit fullscreen mode


`

`

# wordcutpy.py
# การใช้ wordcutpy ที่ถูกต้องคือ copy & paste เลย ไม่ต้องใช้ pip 😅
# แล้วก็ copy bigthai.txt มาไว้ folder เดียวกัน

import sys
import re

class PrefixTree(object):
    """Prefix tree flattened into a single dictionary.

    ``tab`` maps ``(row, offset, member)`` to
    ``(child_row, is_terminal, payload)``. Row 0 is the root, ``offset`` is
    the depth of ``member`` inside its sequence, and ``child_row`` is the
    index (in sorted order) of the first entry that created the node.
    """

    def __init__(self, members_with_payload):
        """Build the table from an iterable of ``(members, payload)`` pairs.

        ``members`` is an indexable sequence such as a string. The payload
        is stored only on the node that ends a sequence. Passing ``None``
        leaves the tree empty.
        """
        self.tab = {}
        if members_with_payload is None:
            return

        # Sorting by the sequence puts every prefix before its extensions,
        # so a node is created by the shortest entry that reaches it.
        entries = sorted(members_with_payload, key=lambda entry: entry[0])

        for row, (members, payload) in enumerate(entries):
            node = 0
            final_offset = len(members) - 1
            for offset, member in enumerate(members):
                key = (node, offset, member)
                existing = self.tab.get(key)
                if existing is not None:
                    # Shared prefix: follow the node that is already there.
                    node = existing[0]
                    continue
                terminal = offset == final_offset
                self.tab[key] = (row, terminal, payload if terminal else None)
                node = row

    def lookup(self, i, offset, member):
        """Return the table entry for ``(i, offset, member)`` or ``None``."""
        return self.tab.get((i, offset, member))

# Link types stored under the "type" key of the entries built by build_path.
UNK = 1  # fallback link for text that no other rule matched
DICT = 2  # link produced by a word-list match
INIT = 3  # sentinel entry at the start of a path
LATIN = 4  # link covering a run of A-Z / a-z letters
PUNC = 5  # link covering a run of spaces

def is_better(link0, link1):
    """Return True when candidate ``link1`` should replace ``link0``.

    ``link0`` is the best link found so far, or ``None`` when there is no
    candidate yet. Links are dictionaries with an ``"unk"`` count (unknown
    tokens on the path) and a ``"w"`` count (tokens on the path).

    Candidates are ranked lexicographically: fewer unknown tokens wins, and
    the token count only breaks ties. Comparing the two counts independently
    would let a path with more unknown tokens win merely because it has
    fewer tokens, and would make the outcome depend on the order in which
    candidates are examined.
    """
    if link0 is None:
        return True

    if link1["unk"] != link0["unk"]:
        return link1["unk"] < link0["unk"]

    return link1["w"] < link0["w"]

def build_path(dix, s):
    """Build the table of best segmentation links for the text ``s``.

    Parameters:
        dix: object whose ``lookup(row, offset, ch)`` returns ``None`` or a
            ``(child_row, is_final, payload)`` tuple (e.g. ``PrefixTree``).
        s: the text to segment.

    Returns:
        A list with ``len(s) + 1`` entries. Entry 0 is an ``INIT`` sentinel.
        Entry ``k`` describes the token chosen to end just before index
        ``k`` of ``s``: ``"p"`` is the index where that token starts (and
        the index of its predecessor entry), ``"w"`` the number of tokens on
        the path so far, ``"unk"`` the number of ``UNK`` tokens on the path
        so far, and ``"type"`` one of the link-type constants.
    """
    latin_pat = re.compile(u"[A-Za-z]")
    last = len(s) - 1

    # Index where the current run of unmatched text starts.
    left_boundary = 0
    dict_acc_list = []

    path = [{"p": None, "w": 0, "unk": 0, "type": INIT}]

    latin_s = None
    latin_e = None

    punc_s = None
    punc_e = None

    for i, ch in enumerate(s):
        at_end = i == last

        # Dictionary acceptors: start a new one at this character, then
        # keep only those the word list can still extend with ``ch``.
        dict_acc_list.append({"s": i, "p": 0, "final": False})
        survivors = []
        for acc in dict_acc_list:
            child = dix.lookup(acc["p"], i - acc["s"], ch)
            if child is not None:
                child_p, is_final, _ = child
                survivors.append({"s": acc["s"], "p": child_p,
                                  "final": is_final})
        dict_acc_list = survivors

        # Latin words: remember where the run starts and mark its end only
        # on the last letter of the run (same rule as the space runs
        # below), so a single letter followed by other text is recognised.
        if latin_pat.match(ch):
            if latin_s is None:
                latin_s = i
            if at_end or not latin_pat.match(s[i + 1]):
                latin_e = i
        else:
            latin_s = None
            latin_e = None

        # Punctuation: only runs of spaces are recognised.
        if ch == " ":
            if punc_s is None:
                punc_s = i
            if at_end or s[i + 1] != " ":
                punc_e = i
        else:
            punc_s = None
            punc_e = None

        # Collect candidate tokens ending at this character, in the order
        # word list, Latin run, space run.
        candidates = [(acc["s"], DICT) for acc in dict_acc_list
                      if acc["final"]]
        if latin_e is not None:
            candidates.append((latin_s, LATIN))
        if punc_e is not None:
            candidates.append((punc_s, PUNC))

        # Select the best link among the candidates.
        link = None
        for start, link_type in candidates:
            p_link = path[start]
            _link = {"p": start,
                     "w": p_link["w"] + 1,
                     "unk": p_link["unk"],
                     "type": link_type}
            if is_better(link, _link):
                link = _link

        # Fallback: extend the unknown token that starts at left_boundary.
        if link is None:
            p_link = path[left_boundary]
            link = {"p": left_boundary,
                    "w": p_link["w"] + 1,
                    "unk": p_link["unk"] + 1,
                    "type": UNK}
        path.append(link)

        # The known token just accepted ends after character i, so any
        # unknown text that follows starts at i + 1. Using i here would
        # make the next unknown token swallow the last character of the
        # known one.
        if link["type"] != UNK:
            left_boundary = i + 1
    return path

def path_to_tokens(txt, path):
    """Turn a link table into the list of tokens it describes.

    Starting from the last entry of ``path``, follow each entry's ``"p"``
    index back until an entry whose ``"p"`` is ``None`` is reached, slicing
    one token out of ``txt`` per step. Tokens are returned in text order.
    Returns ``None`` when ``path`` has fewer than two entries.
    """
    if len(path) < 2:
        return None

    tokens = []
    end = len(path) - 1
    start = path[end]["p"]
    while start is not None:
        tokens.append(txt[start:end])
        end = start
        start = path[end]["p"]

    # Tokens were collected from the end of the text backwards.
    return tokens[::-1]

def tokenize(dix, txt):
    """Split ``txt`` into tokens using the word list held by ``dix``.

    ``None`` and the empty string yield an empty list; anything else is
    segmented with ``build_path`` and read back with ``path_to_tokens``.
    """
    if txt is not None and txt != "":
        return path_to_tokens(txt, build_path(dix, txt))
    return []

class Wordcut(object):
    """Word segmenter backed by a prefix tree of known words."""

    def __init__(self, wordlist):
        """Index ``wordlist`` (an iterable of words) for tokenization."""
        self.dix = PrefixTree([(word, None) for word in wordlist])

    @classmethod
    def bigthai(cls):
        """Create a segmenter from ``bigthai.txt`` next to this module.

        The file is read one word per line; trailing whitespace is stripped
        and duplicates are removed.
        """
        import os

        file_dir = os.path.dirname(__file__)
        filename = os.path.join(file_dir, 'bigthai.txt')
        # Decode explicitly instead of relying on the locale default, which
        # is not UTF-8 everywhere (e.g. a C/POSIX locale in a container).
        # NOTE(review): assumes bigthai.txt is UTF-8 encoded - confirm.
        with open(filename, encoding="utf-8") as dict_file:
            words = {line.rstrip() for line in dict_file}
        # A blank line would contribute an empty word, which adds nothing
        # to the prefix tree.
        words.discard("")
        return cls(sorted(words))

    def tokenize(self, s):
        """Return the list of tokens found in the string ``s``."""
        return tokenize(self.dix, s)

# Benchmark: tokenize every line of wiki_plain_100k.txt into wiki.cut and
# print the elapsed time in milliseconds.
wordcut = Wordcut.bigthai()

import time

# perf_counter is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments.
t1 = int(round(time.perf_counter() * 1000))

# Thai text: decode and encode explicitly rather than with the locale
# default, which is not UTF-8 in every environment.
# NOTE(review): assumes the input file is UTF-8 encoded - confirm.
with open("wiki_plain_100k.txt", encoding="utf-8") as fi, \
        open("wiki.cut", "w", encoding="utf-8") as fo:
    for line in fi:
        print(" ".join(wordcut.tokenize(line.strip())), file=fo)

t2 = int(round(time.perf_counter() * 1000))

print(t2 - t1)

# LICENSE: LGPLv3

Enter fullscreen mode Exit fullscreen mode


`

https://github.com/veer66/wordcutpy

Heroku

This site is built on Heroku

Join the ranks of developers at Salesforce, Airbase, DEV, and more who deploy their mission critical applications on Heroku. Sign up today and launch your first app!

Get Started

Top comments (0)

Sentry image

See why 4M developers consider Sentry, “not bad.”

Fixing code doesn’t have to be the worst part of your day. Learn how Sentry can help.

Learn more

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay