Page Parser Utils For scraping, List index update
Project description
Parse Utilities (ParseUtils)
This package helps you extract Python dicts from HTML/XML content.
Installation
pip install parse-utils
Usage
from parse_utils.page_parser import PageParser, ItemExtractor
# Small HTML page with two <p id="header"> elements and one
# <p class="content"> element; parsed by test_html_parser.
html_data = """
<html>
<head><title>This is title</title></head>
<body>
<p id="header">This is header id</p>
<p id="header">This is header2</p>
<p class="content">This is content</p>
</body>
</html>
"""
# HTML page whose <ul id="contents"> holds four <li> rows, each with a link
# and a description paragraph; iterated by the test_items_parser* examples.
html_rows_data = """
<html>
<head><title>This is title</title></head>
<body>
<ul id="contents">
<li><a href="/first"> First Item</a><p>Description of Item 1</p></li>
<li><a href="/Second"> Second Item</a><p>Description of Item 2</p></li>
<li><a href="/Second"> Second Item</a><p>Description of Item 3</p></li>
<li><a href="/Second"> First Item</a><p>Description of Item 4</p></li>
</ul>
<p class="content">This is content</p>
</body>
</html>
"""
# Nested dict used as the already-decoded JSON input of test_json_parser.
json_data = {"name": "Yogendra", "address": {"country": "Nepal", "city": "Pokhara",}}
def test_html_parser():
    """Extract a dict from ``html_data`` with ``PageParser.extract_dict``.

    Each config key names an output field and maps to a list of XPath
    expressions. The extraction is run three times -- with the defaults, with
    ``is_list=True`` and with ``linebreaks=True`` -- printing the ``body``
    field of the latter two runs and then the whole default result.
    """
    config = {
        "header": ['//p[@id="header"]/text()'],
        "content": ['//p[@class="content"]'],
        # Keyed "body" (not "description") so that the ["body"] lookups below
        # match a field that is actually requested; the documented output
        # likewise shows a 'body' key.
        "body": ["//body"],
    }
    pparser = PageParser(html_data)
    item = pparser.extract_dict(config)
    item2 = pparser.extract_dict(config, is_list=True)
    print(item2["body"])
    item3 = pparser.extract_dict(config, linebreaks=True)
    print(item3["body"])
    print(item)
def test_json_parser():
    """Pull selected values out of ``json_data`` and print the resulting dict.

    Every entry maps an output field to a list of keys; presumably the list
    is a key path into the nested data (the documented output resolves
    ``["address", "city"]`` to the city name).
    """
    key_paths = {"header": ["name"], "city": ["address", "city"]}
    print(PageParser(json_data, selector=True).extract_dict_from_json(key_paths))
def test_items_parser():
    """Print every item that ``ItemExtractor`` yields for ``html_rows_data``.

    The rows are selected with ``//ul/li``; the field XPaths all start with
    ``./`` and are presumably evaluated relative to each selected row.
    """
    row_xpath = "//ul/li"
    field_xpaths = {
        "title": ["./a/text()"],
        "description": ["./p"],
        "link": ["./a/@href"],
    }
    rows = ItemExtractor.extract_items(row_xpath, field_xpaths, html_rows_data)
    for row in rows:
        print(row)
def test_items_parser_with_seed():
    """Print the items extracted from ``html_rows_data`` when a seed dict is given.

    The seed is handed to ``extract_items`` through its ``item`` keyword;
    presumably its entries are carried into every extracted item -- confirm
    against the library.
    """
    row_xpath = "//ul/li"
    field_xpaths = {
        "title": ["./a/text()"],
        "description": ["./p"],
        "link": ["./a/@href"],
    }
    rows = ItemExtractor.extract_items(
        row_xpath,
        field_xpaths,
        html_rows_data,
        item={'default_key': 'default_value'},
    )
    for row in rows:
        print(row)
def test_items_parser_with_results():
    """Print the extracted items when the row selector is a list of XPaths.

    ``//apple/ball`` matches nothing in ``html_rows_data`` while ``//ul/li``
    does; presumably the library tries the alternatives in order -- confirm
    against the library. A seed dict is passed through the ``item`` keyword.
    """
    row_xpaths = ["//apple/ball", "//ul/li"]
    field_xpaths = {
        "title": ["./a/text()"],
        "description": ["./p"],
        "link": ["./a/@href"],
    }
    rows = ItemExtractor.extract_items(
        row_xpaths,
        field_xpaths,
        html_rows_data,
        item={'default': 'default_list'},
    )
    for row in rows:
        print(row)
if __name__ == "__main__":
    # Run every example in the order it is defined above.
    for run_example in (
        test_html_parser,
        test_json_parser,
        test_items_parser,
        test_items_parser_with_seed,
        test_items_parser_with_results,
    ):
        run_example()
Output will be (shown for the first three examples):
['This is header id\n This is header2\n This is content']
This is header id
This is header2
This is content
{'header': 'This is header id', 'content': 'This is content', 'body': 'This is header id\n This is header2\n This is content'}
{'header': 'Yogendra', 'city': 'Pokhara'}
{'title': 'First Item', 'description': 'Description of Item 1', 'link': '/first'}
{'title': 'Second Item', 'description': 'Description of Item 2', 'link': '/Second'}
{'title': 'Second Item', 'description': 'Description of Item 3', 'link': '/Second'}
{'title': 'First Item', 'description': 'Description of Item 4', 'link': '/Second'}
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
parse-utils-1.3.2.tar.gz
(3.7 kB
view hashes)
Built Distribution
Close
Hashes for parse_utils-1.3.2-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ad28c50a2eb3737723e6a1ed175d00566c6be80436bdf859fe4685a6d7547696 |
|
MD5 | 31cb9b3d5cd305cf9ced0de9f93c5f37 |
|
BLAKE2b-256 | 4b5c836b2cca76adc9b536c467dcf376550577fb9fb05cbc451651f7e314f1c0 |