import re
# Functions for working with shortened posts
# This regexp does not try to match every possible URL, but we only run it
# against URLs that we have typed in ourselves, so it should be good enough.
# Should be kept in sync with the regexp in js/admin.js
_re_urlmatcher = re.compile(r'\bhttps?://\S+', re.I)
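# Note that since \S+ matches any run of non-whitespace, a URL at the end of a
# sentence, e.g. "see https://example.com.", is matched including the trailing
# period; that's fine for URLs we type in ourselves.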
# This is currently the value for Twitter and the default for Mastodon, so just
# use that globally for now.
url_shortened_len = 23
_url_counts_as_characters = "https://short.url/{}".format((url_shortened_len - len("https://short.url/")) * 'x')
# Return the length of a post, with each URL counted as url_shortened_len characters.
def get_shortened_post_length(txt):
    return len(_re_urlmatcher.sub(_url_counts_as_characters, txt))
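
# For example (with a made-up URL), a post like
#   "Read https://example.com/a/very/long/path/to/article now"
# has a raw length of 56, but the 47-character URL is substituted with the
# 23-character placeholder above, so get_shortened_post_length() returns
# 5 + 23 + 4 = 32.
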
# Truncate a text, taking into account URL shorteners. Will not truncate in the
# middle of a URL, but right now will happily truncate in the middle of a word
# (room for improvement!)
def truncate_shortened_post(txt, maxlen):
    matches = list(_re_urlmatcher.finditer(txt))
    if not matches:
        # Not a single URL, so just truncate
        return txt[:maxlen]
    firststart, firstend = matches[0].span()
    if firststart + url_shortened_len > maxlen:
        # We hit the size limit before the URL or in the middle of it, so skip the whole URL
        return txt[:firststart]
    # inlen is how far we have consumed the original text, outlen the length of
    # the output so far with every URL counted at its shortened length.
    inlen = firstend
    outlen = firststart + url_shortened_len
    for i, curr in enumerate(matches[1:]):
        # matches[i] is the match before curr, since the loop starts at matches[1]
        prevstart, prevend = matches[i].span()
        currstart, currend = curr.span()
        betweenlen = currstart - prevend
        if outlen + betweenlen > maxlen:
            # The limit was hit in the text between URLs
            left = maxlen - outlen
            return txt[:inlen + left]
        if outlen + betweenlen + url_shortened_len > maxlen:
            # The limit was hit in the middle of this URL, so include all the text
            # up to it, but skip the URL.
            return txt[:inlen + betweenlen]
        # The whole URL fit
        inlen += betweenlen + currend - currstart
        outlen += betweenlen + url_shortened_len
    return txt[:inlen + (maxlen - outlen)]
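
# A minimal usage sketch; the post text and the length limit below are made-up
# examples, not values taken from anywhere else in the system.
if __name__ == '__main__':
    post = "Intro text https://example.com/some/long/link and a trailing sentence that keeps going"
    # The 34-character URL counts as only 23 characters towards the length
    print(get_shortened_post_length(post))
    # Truncating to 40 keeps the whole URL plus six more characters of text,
    # since the URL counts as 23: 11 + 23 + 6 = 40
    print(truncate_shortened_post(post, 40))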