
pnlp

This is a pre-processing tool for NLP.

Features

  • a flexible pipeline for text IO
  • flexible tools for text cleaning and extraction
  • sentence cutting and Chinese character cutting
  • Chinese character normalization
  • several kinds of length counts
  • stopwords
  • some magic tools for pre-processing

Install

pip install pnlp

Usage

Iopipe

IO process

tree tests/piop_data/
├── a.md
├── b.txt
├── c.data
├── first
│   ├── fa.md
│   ├── fb.txt
│   ├── fc.data
│   └── second
│       ├── sa.md
│       ├── sb.txt
│       └── sc.data
├── json.json
├── outfile.file
├── outjson.json
└── yml.yml
import os
from pnlp import Reader

DATA_PATH = "./pnlp/tests/piop_data/"
pattern = '*.md'  # could also be '*.txt', 'f*.*', etc.
reader = Reader(pattern)

# Get lines of all files in a directory, with line index and file name
for line in reader(DATA_PATH):
    print(line.lid, line.fname, line.text)
"""
0 a.md line 1 in a.
1 a.md line 2 in a.
2 a.md line 3 in a.
0 fa.md line 1 in fa.
1 fa.md line 2 in fa
...
"""

# Get lines of a single file, with line index and file name
# When a single file is read, `pattern` has no effect
for line in reader(os.path.join(DATA_PATH, "a.md")):
    print(line.lid, line.fname, line.text)
"""
0 a.md line 1 in a.
1 a.md line 2 in a.
2 a.md line 3 in a.
"""

# Get all filepaths in one directory
for path in reader.gen_files(DATA_PATH, pattern):
    print(path)
"""
pnlp/tests/piop_data/a.md
pnlp/tests/piop_data/first/fa.md
pnlp/tests/piop_data/first/second/sa.md
"""

# Get the content (article) of each file in a directory, with file name
paths = reader.gen_files(DATA_PATH, pattern)
articles = reader.gen_articles(paths)
for article in articles:
    print(article.fname)
    print(article.f.read())
"""
a.md
line 1 in a.
line 2 in a.
line 3 in a.
...
"""

# Get lines of all files in a directory, with line index and file name
# equivalent to reader(DATA_PATH) above
paths = reader.gen_files(DATA_PATH, pattern)
articles = reader.gen_articles(paths)
for line in reader.gen_flines(articles):
    print(line.lid, line.fname, line.text)
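
The generators compose naturally. As a quick sketch, here is a per-file line count built from the same pipeline (collections.Counter is just for illustration):

from collections import Counter

# Count lines per file using the generator pipeline above
paths = reader.gen_files(DATA_PATH, pattern)
articles = reader.gen_articles(paths)
line_counts = Counter(line.fname for line in reader.gen_flines(articles))
print(line_counts)  # e.g. Counter({'a.md': 3, ...}) for the sample tree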

Built-in Methods

import pnlp

# Read
file_string = pnlp.read_file(file_path)
file_list = pnlp.read_lines(file_path)
file_json = pnlp.read_json(file_path)
file_yaml = pnlp.read_yaml(file_path)
file_csv = pnlp.read_csv(file_path)

# Write
pnlp.write_json(file_path, data)
pnlp.write_file(file_path, data)

# Others
pnlp.check_dir(dirname) # creates dirname if it does not exist
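
A minimal round-trip sketch for the read/write helpers above (the file name is illustrative):

import pnlp

data = {"name": "pnlp", "tags": ["nlp", "pre-processing"]}
pnlp.write_json("demo.json", data)  # write, then read back
assert pnlp.read_json("demo.json") == data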

Text

Clean and Extract

import re

# Use Text
from pnlp import Text

text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
pattern = re.compile(r'\d+')

# pattern is an re.Pattern or a str
# The default is '', meaning no pattern is applied (actually re.compile(r'.+')); with it, clean returns nothing and extract returns the original text.
# If pattern is a string, a built-in pattern is used; there are 11 types:
#	'chi': Chinese character
#	'pun': Punctuations
#	'whi': White space
#	'nwh': Non White space
#	'wnb': Word and number
#	'nwn': Non word and number
#	'eng': English character
#	'num': Number
#	'pic': Pictures
#	'lnk': Links
#	'emj': Emojis

pt = Text(['chi', pattern])
# pt.extract will return matches and their locations
res = pt.extract(text)

print(res)
"""
{'text': '这是长度测试233', 'mats': ['这是', '长度测试', '233'], 'locs': [(0, 2), (22, 26), (60, 63)]}
"""

print(res.text, res.mats, res.locs)
"""
这是长度测试233 ['这是', '长度测试', '233'] [(0, 2), (22, 26), (60, 63)]
"""
# pt.clean will return cleaned text using the pattern
print(pt.clean(text))
"""
https://www.yam.gift,《 》*)FSJfdsjf😁![](http://xx.jpg)。233.
"""

pt = Text(['pic', 'lnk'])
res = pt.extract(text)

print(res.mats)
"""
['https://www.yam.gif',
 '![](http://xx.jpg)',
 'https://www.yam.gift',
 'http://xx.jpg']
"""

print(pt.clean(text))
"""
这是t长度测试,《 》*)FSJfdsjf😁。233.
"""

Regex

# Use Regex
from pnlp import Regex
reg = Regex()
def clean_text(text: str) -> str:
    text = reg.pwhi.sub("", text)
    text = reg.pemj.sub("", text)
    text = reg.ppic.sub("", text)
    text = reg.plnk.sub("", text)
    return text
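
Applied to the sample text from the Text section, a sketch (exact output depends on the built-in patterns; as shown above, 'pic' may also consume the tail of a bare link that ends in an image suffix):

text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."
print(clean_text(text))
# whitespace, emojis, pictures and links removed, roughly:
# 这是t长度测试,《》*)FSJfdsjf。233.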

Cut

AnypartCut

# Cut by Regex
import re
from pnlp import cut_part, psent
text = "你好!欢迎使用。"
sent_list = cut_part(text, psent, with_spliter=True, with_offset=False)
print(sent_list)
"""
['你好!', '欢迎使用。']
"""
pcustom_sent = re.compile(r'[。!]')
sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=False)
print(sent_list)
"""
['你好', '欢迎使用']
"""
sent_list = cut_part(text, pcustom_sent, with_spliter=False, with_offset=True)
print(sent_list)
"""
[('你好', 0, 3), ('欢迎使用', 3, 8)]
"""

SentenceCut

# Cut Sentence
from pnlp import cut_sentence as pcs
text = "你好!欢迎使用。"
sent_list = pcs(text)
print(sent_list)
"""
['你好!', '欢迎使用。']
"""

ChineseCharCut

# Cut to Chinese chars
from pnlp import cut_zhchar
text = "你好,hello, 520 i love u. = ”我爱你“。"
char_list = cut_zhchar(text)
print(char_list)
"""
['你', '好', ',', 'hello', ',', ' ', '520', ' ', 'i', ' ', 'love', ' ', 'u', '.', ' ', '=', ' ', '”', '我', '爱', '你', '“', '。']
"""
char_list = cut_zhchar(text, remove_blank=True)
print(char_list)
"""
['你', '好', ',', 'hello', ',', '520', 'i', 'love', 'u', '.', '=', '”', '我', '爱', '你', '“', '。']
"""

CombineBucket

from pnlp import combine_bucket
parts = [
    '习近平指出',
    '中方不仅维护中国人民生命安全和身体健康',
    '也维护世界人民生命安全和身体健康',
    '我们本着公开',
    '透明'
]
buckets = combine_bucket(parts.copy(), 10, truncate=True, keep_remain=True)
print(buckets)
"""
['习近平指出',
 '中方不仅维护中国人民',
 '生命安全和身体健康',
 '也维护世界人民生命安',
 '全和身体健康',
 '我们本着公开透明']
"""

Normalization

from pnlp import num_norm
num_norm.num2zh(1024) == "一千零二十四"
num_norm.num2zh(1024).to_money() == "壹仟零贰拾肆"
num_norm.zh2num("一千零二十四") == 1024
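
A round-trip sketch for the example value (assuming zh2num inverts num2zh for ordinary integers):

n = 1024
assert num_norm.zh2num(num_norm.num2zh(n)) == n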

StopWords

from pnlp import StopWords, chinese_stopwords, english_stopwords

csw = StopWords("/path/to/custom/stopwords.txt")
csw.stopwords # a set of the custom stopwords

csw.zh == chinese_stopwords # Chinese stopwords
csw.en == english_stopwords # English stopwords
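
A typical use is token filtering; a sketch (the token list is made up, and what gets dropped depends on the bundled stopword list):

tokens = ["这", "是", "一个", "分词", "结果"]
kept = [tok for tok in tokens if tok not in chinese_stopwords]
print(kept)  # function words such as "这" and "是" are likely removed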

Length

from pnlp import Length

text = "这是https://www.yam.gift长度测试,《 》*)FSJfdsjf😁![](http://xx.jpg)。233."

pl = Length(text)
# Note: even if a pattern is given, lengths are always computed on the raw text.
# Lengths are counted in characters, not whole words or numbers.
print("Length of all characters: ", pl.len_all)
print("Length of all non-white characters: ", pl.len_nwh)
print("Length of all Chinese characters: ", pl.len_chi)
print("Length of all words and numbers: ", pl.len_wnb)
print("Length of all punctuations: ", pl.len_pun)
print("Length of all English characters: ", pl.len_eng)
print("Length of all numbers: ", pl.len_num)

"""
Length of all characters:  64
Length of all non-white characters:  63
Length of all Chinese characters:  6
Length of all words and numbers:  41
Length of all punctuations:  14
Length of all English characters:  32
Length of all numbers:  3
"""

Magic

from pnlp import MagicDict

# Nest dict
pmd = MagicDict()
pmd['a']['b']['c'] = 2
print(pmd)

"""
{'a': {'b': {'c': 2}}}
"""

# When a dict is reversed, keys sharing the same value are all preserved
dx = {1: 'a',
      2: 'a',
      3: 'a',
      4: 'b'}
print(MagicDict.reverse(dx))

"""
{'a': [1, 2, 3], 'b': 4}
"""

Test

Clone the repo and run:

$ python -m pytest

ChangeLog

v0.3.1

Add cut_part to cut text into parts by a given regex pattern; add combine_bucket to combine parts into buckets by a given threshold (length).

v0.3.0

Update cut_sentence; Add NumNorm.

v0.28-29

Update cut_zhchar.

v0.27

Add cut_zhchar.

v0.26

Add read_csv, remove as a sentence cut standard.

v0.25

Add stop_words.

v0.24

Fix read_json.

v0.23

Fix Text default rule.

v0.22

Make Text more convenient to use.

v0.21

Add cut_sentence method.

v0.20

Optimize several interfaces and make Text accept a list of regular expression patterns.
