Source code for teek.extras.soup

import contextlib
import functools
import itertools
import re
import threading
import urllib.request
import warnings

import teek
from teek.extras import links
try:
    from teek.extras import image_loader
except ImportError:
    from teek.extras import image_loader_dummy as image_loader


class SoupViewer:
    """Displays BeautifulSoup_ HTML elements in a text widget.

    .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

    .. note::
        If the soup contains ``<img>`` tags, the images are read or
        downloaded automatically by default. Subclass :class:`SoupViewer`
        and override :meth:`download` if you don't want that.

    Images are loaded in threads, so make sure to use
    :func:`teek.init_threads`. Alternatively, you can pass
    ``threads=False``, and the images won't be loaded at all.

    .. attribute:: widget

        The :class:`teek.Text` widget that everything is added to.
    """

    def __init__(self, textwidget, threads=True):
        self._use_threads = threads
        self.widget = textwidget
        # stop touching the widget and free loaded images when it goes away
        self.widget.bind(
            '<Destroy>', functools.partial(self.stop_loading, cleanup=True))

        # endless supply of unique mark name prefixes, one per <img>
        self._image_mark_names = (
            'soup-img-' + str(i) for i in itertools.count(1))

        # bumped by stop_loading(); loader threads started with an older id
        # must not modify the widget anymore
        self._loading_id = 1
        self._loaded_images = []

    def download(self, url):
        """Downloads the content of the URL, and returns it as bytes.

        This method is called whenever the soup contains an ``<img>`` or
        something else that has to be read from a file or downloaded. If it
        raises an exception, the ``alt`` of the ``<img>`` will be displayed
        instead of the actual image, if there is an ``alt``. The ``alt`` is
        also displayed while this method is running.

        By default, this uses :func:`urllib.request.urlopen`. You can
        override this if ``urllib`` is doing something dumb or you want to
        control which things can be downloaded.

        Usually this is called from some other thread than the main thread.
        """
        with urllib.request.urlopen(url) as response:
            return response.read()

    @teek.make_thread_safe
    def stop_loading(self, cleanup=True):
        """Tell currently running threads to do nothing to the :attr:`widget`.

        Things like ``<img>`` elements are loaded with threads, so they might
        add something to the text widget several seconds after the
        :meth:`add_soup` call.

        If ``cleanup`` is ``True``, this method also e.g. deletes already
        loaded images, because then it assumes that they are not needed
        anymore. This means that if you don't pass ``cleanup=False``, you
        should clear the text widget after calling this method.

        This is automatically called with ``cleanup=True`` when the
        :attr:`widget` is destroyed.
        """
        # invalidates the id that running loader threads were started with
        self._loading_id += 1
        if cleanup:
            while self._loaded_images:
                self._loaded_images.pop().delete()

    def create_tags(self):
        """
        Adds :ref:`text tags <textwidget-tags>` to the :attr:`widget` for
        displaying the soup elements.

        This is not called automatically; you should call this before
        actually using the ``SoupViewer``.

        Each text tag is named with ``'soup-'`` followed by the name of the
        corresponding HTML tag, such as ``'soup-p'`` or ``'soup-pre'``. If
        you are not happy with what this method does, you can change the
        text tags after calling it.
        """
        monospace_family = teek.NamedFont('TkFixedFont').family
        family = self.widget.config['font'].family
        basic_size = self.widget.config['font'].size    # may be negative

        h_sizes = {
            'h1': round(2.5 * basic_size),
            'h2': round(2.0 * basic_size),
            'h3': round(1.6 * basic_size),
            'h4': round(1.45 * basic_size),
            'h5': round(1.25 * basic_size),
            'h6': round(1.1 * basic_size),
        }

        # because pep8 line length
        tag = self.widget.get_tag

        # these tags don't need any special settings, but they need to be
        # created to avoid warnings in soup2teek
        tag('soup-p')
        tag('soup-ol')
        tag('soup-ul')

        # there's no soup-a because teek.extras.links handles that
        tag('soup-code')['font'] = (monospace_family, basic_size, '')
        tag('soup-pre')['font'] = (monospace_family, basic_size, '')
        tag('soup-pre')['lmargin1'] = 30
        tag('soup-pre')['lmargin2'] = 50
        tag('soup-li')['lmargin1'] = 10
        tag('soup-li')['lmargin2'] = 10
        tag('soup-strong')['font'] = (family, basic_size, 'bold')
        tag('soup-b')['font'] = (family, basic_size, 'bold')
        tag('soup-em')['font'] = (family, basic_size, 'italic')
        tag('soup-i')['font'] = (family, basic_size, 'italic')
        for h, size in h_sizes.items():
            tag('soup-' + h)['font'] = (family, size, 'bold')

        # make sure that html_pre's indenting stuff works inside list
        # elements
        tag('soup-li').lower('soup-pre')

    def add_soup(self, element):
        """Render a BeautifulSoup4 HTML element.

        The text, images, or whatever the element represents are added to
        the end of the text widget.

        This method looks for methods whose names are ``handle_`` followed
        by the name of a HTML tag; for example, ``handle_h1()`` or
        ``handle_p()``. Those methods run when an element with the
        corresponding tag name is added. You can subclass
        :class:`SoupViewer` and create more of these methods to handle more
        different kinds of tags. There are two things that the methods can
        do:

        1. The method can return None to indicate that :meth:`add_soup`
           shouldn't do anything with the content of the element.

           ::

                def handle_pre(self, pre):
                    text = pre.text.rstrip() + '\\n\\n'
                    self.widget.insert(self.widget.end, text)

        2. The method can be decorated with
           :func:`contextlib.contextmanager`. When it yields,
           :meth:`add_soup` will loop over the element and call itself
           recursively with each subelement.

           ::

                @contextlib.contextmanager
                def handle_ul(self, ul):
                    for li in ul:
                        if li.name == 'li':
                            # '\\N{bullet} ' creates a Unicode black circle
                            # character
                            li.insert(0, '\\N{bullet} ')
                    yield   # the content of the ul is added here
                    self.widget.insert(self.widget.end, '\\n')

        In either case, :meth:`add_soup` adds a
        :ref:`textwidget tag <textwidget-tags>` as explained in
        :meth:`create_tags`.

        If there is no ``handle_`` method for a tag, a
        :class:`RuntimeWarning` is emitted and the content of the element
        is rendered without any special handling.
        """
        # beautifulsoup is buggy, sometimes this recurses infinitely and
        # sometimes this raises AttributeError
        #
        # see handle_ul() for an example of how this would be nice, if this
        # worked
        #element = copy.deepcopy(element)

        if element.name is None:
            # plain text, handle it kind of like web browsers do
            # NOTE(review): other bs4 string nodes (e.g. HTML comments)
            # presumably also have name None and end up here -- confirm
            #
            # \xa0 is non-breaking space
            text = str(element)
            text = re.sub(r'[^\S\xa0]+', ' ', text)

            # don't produce two whitespace characters in a row
            last_char = self.widget.get(
                self.widget.end.back(chars=1), self.widget.end)
            if last_char.isspace():
                text = text.lstrip(' ')

            self.widget.insert(self.widget.end, text)
            return

        try:
            handler = getattr(self, 'handle_' + element.name)
        except AttributeError:
            omg = ("soup contains a <%s> tag, but %s has no handle_%s() method"
                   % (element.name, type(self).__name__, element.name))
            warnings.warn(omg, RuntimeWarning)
            handler = self._fallback_handler

        old_end = self.widget.end
        handler_result = handler(element)
        if handler_result is not None:
            # the handler is a context manager, so the subelements are
            # rendered where it yields
            with handler_result:
                for sub in element:
                    self.add_soup(sub)

        # everything that was just added gets the tag of this element
        self.widget.get_tag('soup-' + element.name).add(
            old_end, self.widget.end)

    @contextlib.contextmanager
    def _fallback_handler(self, element):
        """Used for tags that have no ``handle_`` method; adds nothing."""
        yield

    def handle_pre(self, element):
        """Insert the text of the ``<pre>`` as is, then a blank line."""
        self.widget.insert(self.widget.end, element.text.rstrip() + '\n\n')

    def handle_br(self, element):
        """Insert a newline."""
        self.widget.insert(self.widget.end, '\n')

    @contextlib.contextmanager
    def _do_nothing_handler(self, element):
        """Render the content only; the look comes from the text tag."""
        yield

    handle_i = handle_em = _do_nothing_handler
    handle_b = handle_strong = _do_nothing_handler
    handle_code = _do_nothing_handler

    @contextlib.contextmanager
    def _double_newline_handler(self, element):
        """Render the content and add a blank line after it."""
        yield
        self.widget.insert(self.widget.end, '\n\n')

    handle_h1 = _double_newline_handler
    handle_h2 = _double_newline_handler
    handle_h3 = _double_newline_handler
    handle_h4 = _double_newline_handler
    handle_h5 = _double_newline_handler
    handle_h6 = _double_newline_handler
    handle_p = _double_newline_handler

    @contextlib.contextmanager
    def handle_ul(self, element):
        """Prefix each ``<li>`` with a bullet; add a newline after the list.

        Note that this modifies the soup: the bullets are inserted into the
        ``<li>`` elements themselves.
        """
        for li in element:
            if li.name == 'li':
                li.insert(0, '\N{bullet} ')
        yield
        self.widget.insert(self.widget.end, '\n')

    @contextlib.contextmanager
    def handle_ol(self, element):
        """Prefix each ``<li>`` with its number; add a newline after the list.

        Note that this modifies the soup: the numbers are inserted into the
        ``<li>`` elements themselves.
        """
        list_items = (sub for sub in element if sub.name == 'li')
        for num, li in enumerate(list_items, start=1):
            li.insert(0, str(num) + '. ')
        yield
        self.widget.insert(self.widget.end, '\n')

    @contextlib.contextmanager
    def handle_li(self, element):
        """Render the content and make sure that it ends with a newline."""
        yield
        last_char = self.widget.get(self.widget.end.back(chars=1))
        if last_char != '\n':
            self.widget.insert(self.widget.end, '\n')

    @contextlib.contextmanager
    def handle_a(self, element):
        """Render the content and turn it into a clickable link.

        An ``<a>`` without a ``href`` attribute (e.g. a named anchor) is
        rendered as ordinary content instead of failing with ``KeyError``.
        """
        start = self.widget.end
        yield
        end = self.widget.end

        href = element.attrs.get('href')
        if href is not None:
            links.add_url_link(self.widget, href, start, end)

    @contextlib.contextmanager
    def handle_img(self, element):
        """Insert the ``alt`` text and start loading the image.

        The alt text is surrounded with two marks, so that
        :meth:`_add_image` can later find it and replace it with the image.
        Nothing is loaded if threads are disabled or if the ``<img>`` has
        no ``src`` attribute; the alt text is all that is shown then.
        """
        loading_id = self._loading_id
        mark_name = next(self._image_mark_names)

        # TODO: add 'mark gravity' to teek
        self.widget.marks[mark_name + '-start'] = self.widget.end
        teek.tcl_call(None, self.widget, 'mark', 'gravity',
                      mark_name + '-start', 'left')
        self.widget.insert(self.widget.end, element.attrs.get('alt', ''))
        self.widget.marks[mark_name + '-end'] = self.widget.end
        teek.tcl_call(None, self.widget, 'mark', 'gravity',
                      mark_name + '-end', 'left')

        src = element.attrs.get('src')
        if self._use_threads and src is not None:
            # daemon=True because i don't care wtf happens to this thread
            threading.Thread(
                target=self._image_loader_thread,
                args=[mark_name, src, loading_id],
                daemon=True).start()

        yield

    def _image_loader_thread(self, mark_name, src, loading_id):
        """Download the image and hand it over to :meth:`_add_image`.

        Runs in a thread started by :meth:`handle_img`. ``loading_id`` is
        the value that ``_loading_id`` had when the thread was started.
        """
        bytez = self.download(src)

        # cheap early exit; _add_image() checks the id again, because
        # stop_loading() can run before _add_image() gets its turn
        if loading_id == self._loading_id:
            self._add_image(mark_name, bytez, loading_id)

    # only one of these will be running at a time, because the decoration
    @teek.make_thread_safe
    def _add_image(self, mark_name, bytez, loading_id=None):
        """Replace the alt text between the marks with the loaded image.

        If ``loading_id`` is given and :meth:`stop_loading` has been called
        after the loading started, this does nothing. ``None`` skips the
        check.
        """
        if loading_id is not None and loading_id != self._loading_id:
            return

        image = image_loader.from_bytes(bytez)
        # remembered so that stop_loading() can delete it
        self._loaded_images.append(image)

        start_pos = self.widget.marks[mark_name + '-start']
        end_pos = self.widget.marks[mark_name + '-end']

        # the image should get the same tags as the alt text that it replaces
        tags = self.widget.get_all_tags(start_pos)
        self.widget.delete(start_pos, end_pos)
        teek.tcl_call(None, self.widget, 'image', 'create', start_pos,
                      '-image', image)
        for tag in tags:
            tag.add(start_pos, start_pos.forward(chars=1))