TaskWeaver Browserless Plugin

TaskWeaver Browserless Plugin

scrape_website.py

import os
import json
import requests
from taskweaver.plugin import Plugin, register_plugin
from unstructured.partition.html import partition_html

@register_plugin
class ScrapePlugin(Plugin):
    """TaskWeaver plugin that fetches a web page via Browserless and returns its text."""

    # Browserless endpoint; its response body is parsed as HTML below.
    _CONTENT_ENDPOINT = "https://chrome.browserless.io/content"
    # Upper bound on the HTTP call so a stalled request cannot hang the plugin.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __call__(self, website: str) -> str:
        """Scrape *website* and return its content as plain text.

        Args:
            website: The full URL of the page to scrape.

        Returns:
            The string form of every element produced by ``partition_html``
            for the response body, joined by blank lines.

        Raises:
            KeyError: If the ``BROWSERLESS_API_KEY`` environment variable is
                not set.
            requests.HTTPError: If Browserless answers with an error status.
            requests.RequestException: On connection failure or timeout.
        """
        api_key = os.environ["BROWSERLESS_API_KEY"]
        response = requests.post(
            self._CONTENT_ENDPOINT,
            # Passing the token via ``params`` lets requests URL-encode it.
            params={"token": api_key},
            headers={"cache-control": "no-cache", "content-type": "application/json"},
            data=json.dumps({"url": website}),
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        # Fail loudly instead of parsing an error page as if it were content.
        response.raise_for_status()

        elements = partition_html(text=response.text)
        return "\n\n".join(str(element) for element in elements)

    # Backward-compatible alias for code that used the original method name.
    call = __call__

scrape_website.yaml

name: scrape_website
enabled: true
required: false
description: >
  The ScrapePlugin scrapes website content.
  Pass a URL to receive the raw content of the website.

parameters:

  - name: website
    type: str
    required: true
    description: The full URL of the website to scrape.

returns:

  - name: scraped_content
    type: str
    description: >
      A string containing the raw content of the scraped website.