diff --git a/docs/after.png b/docs/after.png new file mode 100644 index 0000000..1a8e0dc Binary files /dev/null and b/docs/after.png differ diff --git a/docs/before.png b/docs/before.png new file mode 100644 index 0000000..626f0c3 Binary files /dev/null and b/docs/before.png differ diff --git a/poetry.lock b/poetry.lock index ce3ef34..e5ffa38 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "certifi" @@ -272,14 +272,14 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "urllib3" -version = "2.6.2" +version = "2.6.3" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, - {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, + {file = "urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4"}, + {file = "urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed"}, ] [package.extras] diff --git a/pyproject.toml b/pyproject.toml index cec993c..f28ced6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-substack" -version = "0.1.17" +version = "0.1.18" description = "A Python wrapper around the Substack API." authors = ["Paolo Mazza "] license = "MIT" diff --git a/substack/post.py b/substack/post.py index de262ee..3d9c01f 100644 --- a/substack/post.py +++ b/substack/post.py @@ -39,40 +39,41 @@ def parse_inline(text: str) -> List[Dict]: tokens = [] # Process text character by character to handle nested formatting # We'll use regex to find all markdown patterns, then process them in order - + # Find all markdown patterns: links, bold, italic # Pattern order: links first (to avoid conflicts), then bold, then italic link_pattern = r'\[([^\]]+)\]\(([^)]+)\)' bold_pattern = r'\*\*([^*]+)\*\*' italic_pattern = r'(? 0 and text[match.start()-1:match.start()+1] != "![": + # But do NOT skip normal links at position 0. + if match.start() == 0 or text[match.start()-1:match.start()+1] != "![": matches.append((match.start(), match.end(), "link", match.group(1), match.group(2))) - + for match in re.finditer(bold_pattern, text): # Check if this range is already covered by a link if not any(start <= match.start() < end for start, end, _, _, _ in matches): matches.append((match.start(), match.end(), "bold", match.group(1), None)) - + for match in re.finditer(italic_pattern, text): # Check if this range is already covered by a link or bold if not any(start <= match.start() < end for start, end, _, _, _ in matches): matches.append((match.start(), match.end(), "italic", match.group(1), None)) - + # Sort matches by position matches.sort(key=lambda x: x[0]) - + # Build tokens last_pos = 0 for start, end, match_type, content, url in matches: # Add text before this match if start > last_pos: tokens.append({"content": text[last_pos:start]}) - + # Add the formatted content if match_type == "link": tokens.append({ @@ -89,16 +90,16 @@ def parse_inline(text: str) -> List[Dict]: "content": content, "marks": [{"type": "em"}] }) - + last_pos = end - + # Add remaining text if last_pos < len(text): tokens.append({"content": text[last_pos:]}) - + # Filter out empty tokens tokens = [t for t in tokens if t.get("content")] - + return tokens @@ -351,7 +352,7 @@ def marks(self, marks): for mark in marks: new_mark = {"type": mark.get("type")} if mark.get("type") == "link": - href = mark.get("href") + href = mark.get("href") or mark.get("attrs", {}).get("href") new_mark.update({"attrs": {"href": href}}) content_marks.append(new_mark) content["marks"] = content_marks @@ -572,7 +573,7 @@ def from_markdown(self, markdown_content: str, api=None): alt_text = linked_image_match.group(1) image_url = linked_image_match.group(2) link_url = linked_image_match.group(3) - + # Adjust image URL if it starts with a slash image_url = image_url[1:] if image_url.startswith("/") else image_url @@ -613,22 +614,49 @@ def from_markdown(self, markdown_content: str, api=None): # Process paragraphs or bullet lists else: if "\n" in text_content: - # Process each line separately (for bullet lists) + # Process each line, grouping consecutive bullets + # into a single bullet_list node + pending_bullets: List[List[Dict]] = [] + + def flush_bullets(): + if not pending_bullets: + return + list_items = [] + for bullet_nodes in pending_bullets: + list_items.append({ + "type": "list_item", + "content": [{"type": "paragraph", "content": bullet_nodes}], + }) + self.draft_body["content"].append( + {"type": "bullet_list", "content": list_items} + ) + pending_bullets.clear() + for line in text_content.split("\n"): line = line.strip() if not line: + flush_bullets() continue - # Remove bullet marker if present + + # Check for bullet marker + bullet_text = None if line.startswith("* "): - line = line[2:].strip() + bullet_text = line[2:].strip() elif line.startswith("- "): - line = line[2:].strip() + bullet_text = line[2:].strip() elif line.startswith("*") and not line.startswith("**"): - line = line[1:].strip() - - if line: + bullet_text = line[1:].strip() + + if bullet_text is not None: + tokens = parse_inline(bullet_text) + if tokens: + pending_bullets.append(tokens) + else: + flush_bullets() tokens = parse_inline(line) self.add({"type": "paragraph", "content": tokens}) + + flush_bullets() else: # Single paragraph tokens = parse_inline(text_content) diff --git a/tests/substack/test_post.py b/tests/substack/test_post.py new file mode 100644 index 0000000..c8e1512 --- /dev/null +++ b/tests/substack/test_post.py @@ -0,0 +1,82 @@ +"""Tests for Post and parse_inline.""" + +import json + +from substack.post import Post, parse_inline + + +class TestParseInline: + """Tests for parse_inline link handling.""" + + def test_link_at_start_of_text(self): + """Links at position 0 should be parsed correctly.""" + result = parse_inline("[GPT](https://openai.com/)") + assert len(result) == 1 + assert result[0]["content"] == "GPT" + assert result[0]["marks"][0]["attrs"]["href"] == "https://openai.com/" + + def test_multiple_links_on_same_line(self): + """All links on the same line should be parsed.""" + result = parse_inline( + "[GPT](https://openai.com/) and [Claude](https://anthropic.com/)" + ) + links = [r for r in result if r.get("marks")] + assert len(links) == 2 + assert links[0]["content"] == "GPT" + assert links[0]["marks"][0]["attrs"]["href"] == "https://openai.com/" + assert links[1]["content"] == "Claude" + assert links[1]["marks"][0]["attrs"]["href"] == "https://anthropic.com/" + + def test_image_not_parsed_as_link(self): + """Image syntax ![alt](url) should not be parsed as a link.""" + result = parse_inline("![alt](https://example.com/img.png)") + links = [r for r in result if r.get("marks")] + assert len(links) == 0 + + def test_link_mid_text(self): + """Links in the middle of text should work.""" + result = parse_inline("Check [this](https://example.com) out") + links = [r for r in result if r.get("marks")] + assert len(links) == 1 + assert links[0]["marks"][0]["attrs"]["href"] == "https://example.com" + + +class TestPostMarks: + """Tests for Post.marks() link href handling.""" + + def test_marks_preserves_href_from_attrs(self): + """marks() should read href from attrs when present.""" + post = Post(title="Test", subtitle="", user_id=1) + post.from_markdown("[Example](https://example.com)") + body = json.loads(post.get_draft()["draft_body"]) + # Find the link mark + for block in body["content"]: + for node in block.get("content", []): + for mark in node.get("marks", []): + if mark.get("type") == "link": + assert mark["attrs"]["href"] == "https://example.com" + return + raise AssertionError("No link mark found in output") + + def test_marks_preserves_href_from_top_level(self): + """marks() should also work when href is at top level (legacy format).""" + post = Post(title="Test", subtitle="", user_id=1) + post.add( + { + "type": "paragraph", + "content": [ + { + "content": "Link", + "marks": [{"type": "link", "href": "https://example.com"}], + } + ], + } + ) + body = json.loads(post.get_draft()["draft_body"]) + for block in body["content"]: + for node in block.get("content", []): + for mark in node.get("marks", []): + if mark.get("type") == "link": + assert mark["attrs"]["href"] == "https://example.com" + return + raise AssertionError("No link mark found in output")