Improve performance of pagify() (#5698)

Co-authored-by: jack1142 <6032823+jack1142@users.noreply.github.com>
Jakub Kuczys 2023-04-13 20:52:54 +02:00 committed by GitHub
parent 79d11e947c
commit 533f036ed2
3 changed files with 206 additions and 35 deletions
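
Taken together, the three diffs below turn pagify() from a generator function into a small iterator class: callers keep the same interface but also get a cheap page-count estimate via operator.length_hint(). A minimal usage sketch (the sample text and the printed lengths are illustrative, not taken from the commit):

import operator

from redbot.core.utils.chat_formatting import pagify

text = "word " * 2000  # illustrative long message

pages = pagify(text, delims=("\n", " "), page_length=2000)

# New in this commit: operator.length_hint() asks the iterator for an estimate
# of the remaining page count without consuming it or copying the text.
print(operator.length_hint(pages))

# Iteration itself behaves exactly like the old generator-based pagify().
for page in pages:
    print(len(page))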

View File

@@ -27,6 +27,10 @@ Chat Formatting
.. automodule:: redbot.core.utils.chat_formatting
    :members:
    :exclude-members: pagify

.. autofunction:: pagify(text, delims=('\n',), *, priority=False, escape_mass_mentions=True, shorten_by=8, page_length=2000)
Embed Helpers
=============
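
The directive above pins the documented signature: everything after delims is keyword-only, and shorten_by is subtracted from page_length so each page leaves room for whatever markup the caller wraps around it. A short sketch of that interaction (the generated input and the box() wrapping are illustrative):

from redbot.core.utils.chat_formatting import box, pagify

text = "\n".join(f"line {i}" for i in range(500))  # illustrative input

# With the documented defaults, each page is at most 2000 - 8 = 1992 characters,
# which leaves room for the handful of characters a code-block wrapper adds.
for page in pagify(text, delims=("\n",), priority=False, shorten_by=8, page_length=2000):
    assert len(page) <= 2000 - 8
    message = box(page)  # box() is another chat_formatting helper, used here for illustration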

View File

@@ -1,5 +1,8 @@
from __future__ import annotations
import datetime
import itertools
import math
import textwrap
from io import BytesIO
from typing import Iterator, List, Optional, Sequence, SupportsInt, Union
@@ -200,17 +203,11 @@ def spoiler(text: str, escape_formatting: bool = True) -> str:
    return f"||{escape(text, formatting=escape_formatting)}||"

def pagify(
    text: str,
    delims: Sequence[str] = ["\n"],
    *,
    priority: bool = False,
    escape_mass_mentions: bool = True,
    shorten_by: int = 8,
    page_length: int = 2000,
) -> Iterator[str]:
class pagify(Iterator[str]):
    """Generate multiple pages from the given text.

    The returned iterator supports length estimation with :func:`operator.length_hint()`.

    Note
    ----
    This does not respect code blocks or inline code.
@@ -244,33 +241,82 @@ def pagify(
        Pages of the given text.

    """
    in_text = text
    page_length -= shorten_by
    while len(in_text) > page_length:
        this_page_len = page_length
        if escape_mass_mentions:
            this_page_len -= in_text.count("@here", 0, page_length) + in_text.count(
                "@everyone", 0, page_length
            )
        closest_delim = (in_text.rfind(d, 1, this_page_len) for d in delims)
        if priority:
            closest_delim = next((x for x in closest_delim if x > 0), -1)
        else:
            closest_delim = max(closest_delim)
        closest_delim = closest_delim if closest_delim != -1 else this_page_len
        if escape_mass_mentions:
            to_send = escape(in_text[:closest_delim], mass_mentions=True)
        else:
            to_send = in_text[:closest_delim]
        if len(to_send.strip()) > 0:
            yield to_send
        in_text = in_text[closest_delim:]
    if len(in_text.strip()) > 0:
        if escape_mass_mentions:
            yield escape(in_text, mass_mentions=True)
        else:
            yield in_text
    # when changing signature of this method, please update it in docs/framework_utils.rst as well
    def __init__(
        self,
        text: str,
        delims: Sequence[str] = ("\n",),
        *,
        priority: bool = False,
        escape_mass_mentions: bool = True,
        shorten_by: int = 8,
        page_length: int = 2000,
    ) -> None:
        self._text = text
        self._delims = delims
        self._priority = priority
        self._escape_mass_mentions = escape_mass_mentions
        self._shorten_by = shorten_by
        self._page_length = page_length - shorten_by
        self._start = 0
        self._end = len(text)

    def __repr__(self) -> str:
        text = self._text
        if len(text) > 20:
            text = f"{text[:19]}\N{HORIZONTAL ELLIPSIS}"
        return (
            "pagify("
            f"{text!r},"
            f" {self._delims!r},"
            f" priority={self._priority!r},"
            f" escape_mass_mentions={self._escape_mass_mentions!r},"
            f" shorten_by={self._shorten_by!r},"
            f" page_length={self._page_length + self._shorten_by!r}"
            ")"
        )

    def __length_hint__(self) -> int:
        return math.ceil((self._end - self._start) / self._page_length)

    def __iter__(self) -> pagify:
        return self

    def __next__(self) -> str:
        text = self._text
        escape_mass_mentions = self._escape_mass_mentions
        page_length = self._page_length
        start = self._start
        end = self._end
        while (end - start) > page_length:
            stop = start + page_length
            if escape_mass_mentions:
                stop -= text.count("@here", start, stop) + text.count("@everyone", start, stop)
            closest_delim_it = (text.rfind(d, start + 1, stop) for d in self._delims)
            if self._priority:
                closest_delim = next((x for x in closest_delim_it if x > 0), -1)
            else:
                closest_delim = max(closest_delim_it)
            stop = closest_delim if closest_delim != -1 else stop
            if escape_mass_mentions:
                to_send = escape(text[start:stop], mass_mentions=True)
            else:
                to_send = text[start:stop]
            start = self._start = stop
            if len(to_send.strip()) > 0:
                return to_send
        if len(text[start:end].strip()) > 0:
            self._start = end
            if escape_mass_mentions:
                return escape(text[start:end], mass_mentions=True)
            else:
                return text[start:end]
        raise StopIteration

def strikethrough(text: str, escape_formatting: bool = True) -> str:
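
The speed-up comes from the removed in_text = in_text[closest_delim:] line: the old generator copied the whole remaining text once per page, which degrades quadratically on long inputs, while the new class only advances self._start and hands index ranges to str.rfind and str.count, so only each yielded page gets copied. A rough way to observe the effect (the input size and the timeit setup are illustrative; no timings are quoted from the commit):

import timeit

from redbot.core.utils.chat_formatting import pagify

text = ("lorem ipsum " * 100 + "\n") * 1_000  # roughly 1.2 MB of input, illustrative

def consume_all_pages() -> int:
    # Exhausting the iterator forces every page boundary to be computed.
    return sum(len(page) for page in pagify(text, delims=("\n", " ")))

# With the index-based implementation this scales roughly linearly with len(text);
# the old slicing-based generator slowed down quadratically on inputs of this size.
print(timeit.timeit(consume_all_pages, number=5))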

View File

@@ -1,5 +1,6 @@
import asyncio
import pytest
import operator
import random
from redbot.core.utils import (
    bounded_gather,
@@ -7,6 +8,8 @@ from redbot.core.utils import (
    deduplicate_iterables,
    common_filters,
)
from redbot.core.utils.chat_formatting import pagify
from typing import List

def test_deduplicate_iterables():
@@ -137,3 +140,121 @@ async def test_bounded_gather_iter_cancel():
def test_normalize_smartquotes():
    assert common_filters.normalize_smartquotes("Should\u2018 normalize") == "Should' normalize"
    assert common_filters.normalize_smartquotes("Same String") == "Same String"

@pytest.mark.parametrize(
    "text,pages,page_length",
    (
        # base case
        (
            "Line 1\nA longer line 2\n'tis a veeeeery long line numero tres\nand the last line",
            [
                "Line 1\nA",
                " longer line 2",
                "\n'tis a",
                " veeeeery long",
                " line numero",
                " tres\nand the",
                " last line",
            ],
            15,
        ),
        # mid-word split
        (
            "Interdisciplinary collaboration improves the quality\nof care.",
            ["Interdisciplinar", "y collaboration", " improves the", " quality\nof", " care."],
            16,
        ),
        # off-by-one errors
        ("Lorem ipsum dolor sit amet.", ["Lorem", " ipsum", " dolor", " sit", " amet."], 6),
        (
            "Lorem ipsum dolor sit amet.",
            # TODO: "r" and " sit" can fit together but current logic doesn't support it properly
            ["Lorem", " ipsu", "m", " dolo", "r", " sit", " amet", "."],
            5,
        ),
        (
            "Lorem ipsum dolor sit amet.",
            ["Lore", "m", " ips", "um", " dol", "or", " sit", " ame", "t."],
            4,
        ),
        # mass mentions
        (
            "@everyone listen to me!",
            # TODO: off-by-one: " listen" and " to me!" should have been " listen to" and " me!"
            ["@\u200beveryone", " listen", " to me!"],
            10,
        ),
        (
            "@everyone listen to me!",
            ["@everyon", "e listen", " to me!"],
            9,
        ),
        (
            "@everyone listen to me!",
            ["@everyon", "e", " listen", " to me!"],
            8,
        ),
        ("Is anyone @here?", ["Is anyone", " @\u200bhere?"], 10),
        # whitespace-only page skipping (`\n` skipped)
        ("Split:\n Long-word", ["Split:", " Long-", "word"], 6),
    ),
)
def test_pagify(text: str, pages: List[str], page_length: int):
    result = []
    for page in pagify(text, ("\n", " "), shorten_by=0, page_length=page_length):
        # sanity check
        assert len(page) <= page_length
        result.append(page)

    assert pages == result

@pytest.mark.parametrize(
    "text,pages,page_length",
    (
        # base case
        (
            "Line 1\nA longer line 2\n'tis a veeeeery long line numero tres\nand the last line",
            [
                "Line 1",
                "\nA longer line",
                " 2",
                "\n'tis a",
                " veeeeery long",
                " line numero",
                " tres",
                "\nand the last",
                " line",
            ],
            15,
        ),
        # mid-word split
        (
            "Interdisciplinary collaboration improves the quality\nof care.",
            ["Interdisciplinar", "y collaboration", " improves the", " quality", "\nof care."],
            16,
        ),
    ),
)
def test_pagify_priority(text: str, pages: List[str], page_length: int):
    result = []
    for page in pagify(text, ("\n", " "), priority=True, shorten_by=0, page_length=page_length):
        # sanity check
        assert len(page) <= page_length
        result.append(page)

    assert pages == result

def test_pagify_length_hint():
    it = pagify("A" * 100, shorten_by=0, page_length=10)
    remaining = 100 // 10

    assert operator.length_hint(it) == remaining

    for page in it:
        remaining -= 1
        assert operator.length_hint(it) == remaining

    assert operator.length_hint(it) == 0
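
The new tests pin down page contents and the length hint; the iterator protocol itself follows from __iter__/__next__ in the class above. A further check one could add, sketched here rather than taken from the commit:

import operator

from redbot.core.utils.chat_formatting import pagify

def test_pagify_is_its_own_iterator():
    it = pagify("A" * 30, shorten_by=0, page_length=10)
    # __iter__() returns self, so the object can be handed around and resumed mid-way.
    assert iter(it) is it
    assert next(it) == "A" * 10
    # Consuming one page shrinks the remaining estimate from 3 to 2.
    assert operator.length_hint(it) == 2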