python-catalin: BeautifulSoup

Showing posts with label BeautifulSoup. Show all posts

Wednesday, April 16, 2025

Python Qt6 : Simple tool to convert HTML.

This is based on an older tutorial from this post.

I added an AgentPy class with all the features for processing the input, and an HtmlEditor class for the user interface.

Maybe I will use the agentpy module later, but for now it works well without agents.

This is the source code:

from PyQt6.QtWidgets import QApplication, QMainWindow, QTextEdit, QMenu
from bs4 import BeautifulSoup

class AgentPy:
    """Helpers for processing and cleaning HTML text.

    Every method is static, takes a string and returns a new string.
    """

    # One-pass translation table used by convert_to_html. Translating all
    # characters in a single pass avoids double-escaping: with sequential
    # str.replace calls, the '&' produced for '<' would itself be replaced
    # again when '&' is processed later.
    _HTML_ENTITIES = str.maketrans({
        '<': '&lt;',
        '>': '&gt;',
        '&': '&amp;',
        '"': '&quot;',
        "'": '&#39;',
        '[': '&#91;',
        ']': '&#93;',
    })

    @staticmethod
    def clean_all_styles(html_content):
        """Strip every attribute from every tag, keeping only href on <a> tags.

        Returns the serialized document as a string.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(True):
            if tag.name == "a" and "href" in tag.attrs:
                tag.attrs = {"href": tag.attrs["href"]}
            else:
                tag.attrs = {}
        return str(soup)

    @staticmethod
    def clean_empty_tags(html_content):
        """Remove tags that have no children or only whitespace children.

        Returns the serialized document as a string.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in soup.find_all(True):
            if not tag.contents or all(str(content).strip() == "" for content in tag.contents):
                tag.decompose()
        return str(soup)

    @staticmethod
    def clean_duplicate_tags(html_content):
        """Keep only the first tag for each (name, attributes) combination.

        Any later tag with the same name and the same attributes is removed
        together with its content. Returns the serialized document as a string.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        seen = set()
        for tag in soup.find_all(True):
            tag_key = (tag.name, str(tag.attrs))
            if tag_key in seen:
                tag.decompose()
            else:
                seen.add(tag_key)
        return str(soup)

    @staticmethod
    def convert_to_html(source_code):
        """Convert special characters of source code into HTML entities.

        Escapes <, >, &, double and single quotes, and square brackets so the
        text can be pasted into an HTML page and displayed literally.
        """
        return source_code.translate(AgentPy._HTML_ENTITIES)

class HtmlEditor(QMainWindow):
    """Main window: a text editor whose context menu runs AgentPy clean-ups."""

    def __init__(self):
        # Imported locally so the class does not depend on a Qt name being
        # defined at module level by whoever runs or imports this file.
        from PyQt6.QtCore import Qt

        super().__init__()
        self.setWindowTitle("catafest-cleaner-HTML")
        self.setGeometry(100, 100, 800, 600)

        # Text editor
        self.editor = QTextEdit(self)
        self.setCentralWidget(self.editor)

        # Custom context menu
        self.editor.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.editor.customContextMenuRequested.connect(self.show_context_menu)

    def show_context_menu(self, position):
        """Build the context menu and show it at the requested editor position."""
        menu = QMenu(self)
        entries = (
            ("Clean basic HTML", self.clean_all_styles),
            ("Clean Empty Tags", self.clean_empty_tags),
            ("Clean AgentPy", self.clean_duplicate_tags),
            ("Convert to HTML", self.convert_to_html),
        )
        for label, handler in entries:
            menu.addAction(label).triggered.connect(handler)
        menu.exec(self.editor.mapToGlobal(position))

    def _replace_editor_text(self, transform):
        """Replace the editor's plain text with transform(text)."""
        self.editor.setPlainText(transform(self.editor.toPlainText()))

    def clean_all_styles(self):
        """Strip tag attributes (except href on <a>) from the editor content."""
        self._replace_editor_text(AgentPy.clean_all_styles)

    def clean_empty_tags(self):
        """Remove empty tags from the editor content."""
        self._replace_editor_text(AgentPy.clean_empty_tags)

    def clean_duplicate_tags(self):
        """Remove repeated (name, attributes) tags from the editor content."""
        self._replace_editor_text(AgentPy.clean_duplicate_tags)

    def convert_to_html(self):
        """Escape the editor content into HTML entities."""
        self._replace_editor_text(AgentPy.convert_to_html)

if __name__ == "__main__":
    import sys
    # Kept at module level: this makes Qt a global name for the rest of the file.
    from PyQt6.QtCore import Qt

    # Create the application, show the editor and run the Qt event loop;
    # the loop's return value becomes the process exit status.
    qt_app = QApplication(sys.argv)
    editor_window = HtmlEditor()
    editor_window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)

Saturday, April 12, 2025

Python Qt6 : Simple tool for clean HTML.

Today I made a simple tool to clean HTML of styles and more. I used the artificial intelligence from Copilot.

This is the result of this simple tool.

from PyQt6.QtWidgets import QApplication, QMainWindow, QTextEdit, QMenu
from bs4 import BeautifulSoup

class HtmlEditor(QMainWindow):
    """Main window: a text editor whose context menu offers HTML clean-ups."""

    def __init__(self):
        # Imported locally so the class does not depend on a Qt name being
        # defined at module level by whoever runs or imports this file.
        from PyQt6.QtCore import Qt

        super().__init__()

        self.setWindowTitle("catafest-cleaner-HTML")  # updated title
        self.setGeometry(100, 100, 800, 600)

        # Text editor
        self.editor = QTextEdit(self)
        self.setCentralWidget(self.editor)

        # Custom context menu
        self.editor.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.editor.customContextMenuRequested.connect(self.show_context_menu)

    def show_context_menu(self, position):
        """Build the context menu and show it at the requested editor position."""
        menu = QMenu(self)
        clean_styles_action = menu.addAction("Clean basic HTML")
        clean_styles_action.triggered.connect(self.clean_all_styles)
        clean_empty_tags_action = menu.addAction("Clean Empty Tags")
        clean_empty_tags_action.triggered.connect(self.clean_empty_tags)
        menu.exec(self.editor.mapToGlobal(position))

    def clean_all_styles(self):
        """Strip all tag attributes from the editor content, keeping href on <a>."""
        # Parse the HTML currently in the editor
        html_content = self.editor.toPlainText()
        soup = BeautifulSoup(html_content, 'html.parser')

        # Drop every attribute, except 'href' on anchor tags
        for tag in soup.find_all(True):
            if tag.name == "a" and "href" in tag.attrs:
                tag.attrs = {"href": tag.attrs["href"]}
            else:
                tag.attrs = {}

        # Write the cleaned markup back into the editor
        self.editor.setPlainText(str(soup))

    def clean_empty_tags(self):
        """Remove tags with no children or only whitespace children from the editor content."""
        # Parse the HTML currently in the editor
        html_content = self.editor.toPlainText()
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove each empty tag completely
        for tag in soup.find_all(True):
            if not tag.contents or all(str(content).strip() == "" for content in tag.contents):
                tag.decompose()

        # Write the cleaned markup back into the editor
        self.editor.setPlainText(str(soup))

if __name__ == "__main__":
    import sys
    # Kept at module level: this makes Qt a global name for the rest of the file.
    from PyQt6.QtCore import Qt

    # Create the application, show the editor and run the Qt event loop;
    # the loop's return value becomes the process exit status.
    qt_app = QApplication(sys.argv)
    editor_window = HtmlEditor()
    editor_window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)

Saturday, July 20, 2019

Python 3.7.3 : Use BeautifulSoup to parse Instagram account.

This example is a bit more complex because it parses the page source in a way that depends on the structure of that page.
The basic idea of this script is to fetch the content of an Instagram account in the same way a web browser does.
For my account I got a parsing error; I guess the reason is the dots in the account name, see festila.george.catalin.

    scripts_content = json.loads(scripts[0].text.strip())
IndexError: list index out of range

In this case, comment out that line of code and the script will work.
For the other accounts I've tried, it works very well with the default script.
This is the script I used:

import requests
from bs4 import BeautifulSoup
import json
import re

from pprint import pprint

instagram_url = 'https://instagram.com'
# example Instagram user name: sherwoodseries
# (another account to try: 'festila.george.catalin')
profile_url = input("name of the instagram user: ")

headers = {'User-Agent': 'Mozilla/5.0'}

# Writing the scraped text without an explicit encoding failed with:
#   UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f48b'
#   in position 5022: character maps to <undefined>
# so the output file is opened with encoding='utf-8'.
# The with-block guarantees the file is closed, even if a step below raises.
with open("_shared_data.txt", "w", encoding='utf-8') as file1:
    response = requests.get(f"{instagram_url}/{profile_url}", headers=headers)

    if response.ok:
        html = response.text
        bs_html = BeautifulSoup(html, "html.parser")
        print(bs_html)
        # get info from ... type="application/ld+json">{"@context":"http:\/\/schema.org","@type":"Person","name":
        scripts = bs_html.select('script[type="application/ld+json"]')
        # Left disabled: for some accounts the list above is empty and
        # scripts[0] raises "IndexError: list index out of range".
        #scripts_content = json.loads(scripts[0].text.strip())
        #pprint(scripts_content)

        #print scripts_content like json
        #print(json.dumps(scripts_content,indent = 4,sort_keys = True))

        #print just part of source code get by 'script' (0 .. n), see n = 6
        #print(bs_html.find_all('script')[6])

        # Raw strings keep the regex backslashes literal; string= is the
        # current name of the argument formerly called text=.
        script_tag = bs_html.find('script', string=re.compile(r'window\._sharedData'))
        if script_tag is None:
            # find() returns None when no script matches; report it instead
            # of failing with AttributeError on script_tag.string.
            print('window._sharedData was not found in the page')
        else:
            # Keep what follows the first '=' and drop surrounding spaces/semicolons
            shared_data = script_tag.string.partition('=')[-1].strip(' ;')

            #get item from shared data, see "language_code":"en"
            rex_item = re.compile(r'(?<="language_code":")[a-zA-Z_\- ]+(?=")')
            rex_get_item = rex_item.findall(shared_data)
            print(rex_get_item)
            #get url image from shared data
            rex_url = re.compile(r'(?<="display_url":")[^\s"]+(?=")')
            rex_get_url = rex_url.findall(shared_data)
            print(rex_get_url)

            # load like a json
            result_json = json.loads(shared_data)
            pprint(result_json)

            #example: write to _shared_data.txt file the shared_data
            #file1.write(str(shared_data))

        data = bs_html.find_all('meta', attrs={'property': 'og:description'})
        if not data:
            print('og:description meta tag was not found in the page')
        else:
            bb = data[0].get('content').split()
            # The last three words are used as "name (@user)"
            user = f'{bb[-3]} {bb[-2]} {bb[-1]}'
            # get from bb parts
            posts = bb[4]
            print('all string: ', bb)
            print('number of posts: ', posts)
            print('name and the user: ', user)

        # write any output show by print into _shared_data.txt file, see example
        #file1.write(str(bs_html.find_all('script')[4]))

This is a part of the output for sherwoodseries account:

...
all string:  ['95', 'Followers,', '24', 'Following,', '56', 'Posts', '-', 'See',
 'Instagram', 'photos', 'and', 'videos', 'from', 'Sherwood', 'Series', '(@sherwo
odseries)']
number of posts:  56
name and the user:  Sherwood Series (@sherwoodseries)

Thursday, July 11, 2019

Python 3.7.3 : Three examples with BeautifulSoup.

Beautiful Soup is a library that makes it easy to scrape information from web pages. It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree; see the PyPI webpage.
This python module was created by Leonard Richardson.
A large definition can be this:
Web Scraping (also termed Screen Scraping, Web Data Extraction, Web Harvesting, etc.) is a technique employed to extract large amounts of data from websites whereby the data is extracted and saved to a local file in your computer or to a database in table (spreadsheet) format.
This Python module can do that, but the input format and the output format are different.
The input can be a web page given by its URL, or the page content with all its pieces of information, and the output depends on this input and on the user's choices.
Let's see some examples:
The first example shows you how to take the content (the link titles) of the first matching table from a Wikipedia web page.

# get table from wikipedia 
import requests
from bs4 import BeautifulSoup
# Despite its name, website_url holds the HTML text of the page, not a URL.
website_url = requests.get('https://en.wikipedia.org/w/index.php?title=Table_of_food_nutrients').text
soup = BeautifulSoup(website_url, 'lxml')

my_table = soup.find('table', {'class': 'wikitable collapsible collapsed'})
# find() returns None when no table matches; fall back to an empty result
# instead of failing with AttributeError.
links = my_table.find_all('a') if my_table is not None else []
# Collect the title attribute of every link in the table (None when absent)
Food = [link.get('title') for link in links]

print(Food)

The next example takes all the links (files) from a page:


# get links using the url
# "import urllib" alone does not load the request submodule, so
# urllib.request must be imported explicitly.
import urllib.request
from bs4 import BeautifulSoup

# The with-block closes the HTTP response once the body has been read
with urllib.request.urlopen('http://____share.net/filmes/') as response:
    page = response.read()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(page, 'html.parser')
# Print the target of every anchor that has an href attribute
for anchor in soup.find_all('a', href=True):
    print(anchor['href'])

The last example takes all images from the search query of imgur website:

# get images from imgur search query
# "import urllib" alone does not load the request submodule, so
# urllib.request must be imported explicitly.
import urllib.request
from bs4 import BeautifulSoup

url = 'https://imgur.com/search/score?q=cyborg'
with urllib.request.urlopen(url) as f:
    soup = BeautifulSoup(f.read(), 'lxml')

a_tags = soup.find_all("a", {"class": "image-list-link"})
# One entry per link: its first <img>, or None when the link has no image
img_tags = [a.find("img") for a in a_tags]
print(img_tags)
# Prefix each src with a scheme; links without an image are skipped
# instead of raising TypeError on None.
srcs = ['http:' + img['src'] for img in img_tags if img is not None]

print(srcs)

In conclusion, this module will pose problems for those who do not understand how to navigate the source code and content of web pages, or what 'lxml', 'page', etc. mean.
The Chrome developer tools (F12 key) will greatly help you inspect parts of the web content.

python-catalin

analitics

Pages