binaryninja-api/scripts/check_docstring_formatting.py at dev · Vector35/binaryninja-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
#!/usr/bin/env python3
"""
Script to identify docstring formatting issues in Python files.

Checks for:
1. Bullet lists (lines starting with *, -, +) without blank line before them
2. Numbered lists (lines starting with digits and .) without blank line before them
3. Code blocks (lines starting with >>>) without blank line before them
4. reStructuredText directives (lines starting with ..) without blank line before them

These are common reStructuredText/Sphinx formatting issues that can cause
documentation to render incorrectly.

The script attempts to avoid false positives by:
- Skipping content inside literal blocks (after :: markers)
- Ignoring items that follow Sphinx field markers (:param:, :Example:, etc.)
- Handling Python interactive session output (lines between >>> prompts)
- Recognizing indented continuations

Known limitations:
- May flag some valid trailing >>> prompts in code examples
- Line numbers are approximate (offset from docstring start)
- Some complex nested structures may not be handled perfectly

Usage:
  python check_docstring_formatting.py [paths...]
  python check_docstring_formatting.py -v [paths...]

If no paths are specified, defaults to ../python relative to this script.
"""

import argparse
import os
import re
import ast
import sys
from pathlib import Path


def get_docstrings_from_file_regex(filepath, content):
    """
    Fallback docstring extraction using a line scan when AST parsing fails.
    This handles files with Python 3.10+ syntax like match statements.

    A triple-quoted string is treated as a docstring when the text before the
    opening quotes on its line is empty or ends with ``:`` (e.g. a one-line
    ``def f():``), optionally followed by an ``r``/``u`` string prefix.

    :param filepath: Path of the file being scanned (unused; kept for interface
        compatibility with callers).
    :param content: Full text of the file.
    :return: List of ``(start_line, docstring_text, 'Unknown')`` tuples, where
        ``start_line`` is the 1-based line holding the opening quotes. The name
        slot is always ``'Unknown'`` because no owner is resolved here.
    """
    # Accepting an optional r/u prefix matters: if a raw docstring's opener is
    # missed, its closing quotes get mistaken for an opener and the code that
    # follows is captured as a bogus docstring.
    opener_prefix = re.compile(r'(?:.*:)?\s*[rRuU]?')

    docstrings = []
    docstring_lines = []
    docstring_start = 0
    quote_style = None  # Delimiter of the docstring being read; None when outside one

    for line_no, line in enumerate(content.split('\n'), start=1):
        stripped = line.strip()

        if quote_style is not None:
            # Inside a multi-line docstring: accumulate until the closing delimiter
            if quote_style not in stripped:
                docstring_lines.append(line)
                continue

            end_idx = line.find(quote_style)
            docstring_lines.append(line[:end_idx])
            full_docstring = '\n'.join(docstring_lines)
            if full_docstring.strip():
                docstrings.append((docstring_start, full_docstring, 'Unknown'))
            docstring_lines = []
            quote_style = None
            continue

        # Outside a docstring: look for an opening delimiter of either style
        for quote in ('"""', "'''"):
            idx = stripped.find(quote)
            if idx == -1:
                continue

            # Reject strings that follow other code (assignments, call arguments, ...)
            before = stripped[:idx].strip()
            if not opener_prefix.fullmatch(before):
                continue

            docstring_start = line_no
            after_start = stripped[idx + 3:]
            end_idx = after_start.find(quote)
            if end_idx != -1:
                # Single-line docstring: opener and closer are on the same line
                docstring_content = after_start[:end_idx]
                if docstring_content.strip():
                    docstrings.append((docstring_start, docstring_content, 'Unknown'))
            else:
                quote_style = quote
                docstring_lines = [after_start]
            break

    return docstrings


def get_docstrings_from_file(filepath):
    """
    Extract all docstrings from a Python file with their line numbers.

    The file is parsed with ``ast``; if that fails, extraction falls back to
    ``get_docstrings_from_file_regex``.

    :param filepath: Path of the Python file to read (decoded as UTF-8).
    :return: List of ``(line_num, docstring, name)`` tuples. ``line_num`` is the
        line number ``ast`` reports for the docstring statement, ``docstring``
        is the raw (un-dedented) text, and ``name`` is the function/class name
        or ``'module'``. An unreadable file yields an empty list.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # Best effort: a missing, unreadable or non-UTF-8 file (UnicodeDecodeError
        # is a ValueError) simply contributes no docstrings.
        return []

    # Try AST parsing first
    try:
        tree = ast.parse(content, filename=str(filepath))
    except (SyntaxError, ValueError):
        # Fall back to regex-based extraction for files with newer Python syntax.
        # ValueError is included because ast.parse can reject source containing
        # null bytes that way.
        return get_docstrings_from_file_regex(filepath, content)

    docstrings = []
    for node in ast.walk(tree):
        # Only check nodes that can have docstrings
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)):
            continue

        docstring = ast.get_docstring(node, clean=False)
        if not docstring:
            continue

        # A docstring is always the first statement of its owner's body. Using its
        # own line number also covers module docstrings, which need not start on
        # line 1 (shebang, encoding line, leading comments).
        line_num = node.body[0].lineno
        node_name = 'module' if isinstance(node, ast.Module) else node.name
        docstrings.append((line_num, docstring, node_name))

    return docstrings


def check_docstring_formatting(docstring):
    """
    Check for formatting issues in a docstring.

    Flags bullet list items, numbered list items, ``>>>`` code blocks and
    reStructuredText directives that directly follow a non-blank line, unless
    one of the false-positive heuristics below applies.

    :param docstring: Raw docstring text (original indentation preserved).
    :return: A list of ``(line_offset, issue_description)`` tuples, where
        ``line_offset`` is the 1-based position of the offending line within
        the docstring.
    """
    issues = []
    lines = docstring.split('\n')

    # Patterns that should have a blank line before them (compiled once, up front)
    patterns = [
        (re.compile(r'^\s*[\*\-\+]\s+'), 'bullet list item'),
        (re.compile(r'^\s*\d+\.\s+'), 'numbered list item'),
        (re.compile(r'^\s*>>>'), 'code block'),
        (re.compile(r'^\s*\.\.\s+'), 'reStructuredText directive'),
    ]

    # Sphinx field patterns that can contain code blocks or lists
    sphinx_field_pattern = re.compile(r'^\s*:[A-Za-z_][A-Za-z0-9_]*:')

    def indent_of(text):
        # Number of leading whitespace characters
        return len(text) - len(text.lstrip())

    def starts_block(text):
        # True if the line matches any of the checked constructs
        return any(regex.match(text) for regex, _ in patterns)

    # Track if we're in a literal block (started by ::)
    in_literal_block = False
    literal_block_indent = 0

    # The first line (index 0) is always part of the opening, so start at 1
    for i in range(1, len(lines)):
        line = lines[i]
        prev_line = lines[i - 1]
        current_indent = indent_of(line)

        # A previous line ending with :: opens a literal block
        if prev_line.rstrip().endswith('::'):
            in_literal_block = True
            literal_block_indent = indent_of(prev_line)

        # A non-blank line dedented to the marker's level closes the literal block
        if in_literal_block and line.strip() and current_indent <= literal_block_indent:
            in_literal_block = False

        # Skip checks if we're inside a literal block
        if in_literal_block:
            continue

        # Only the first matching pattern is considered for a line
        description = next((desc for regex, desc in patterns if regex.match(line)), None)
        if description is None:
            continue

        # A blank line before the construct is exactly what we want
        prev_stripped = prev_line.strip()
        if prev_stripped == '':
            continue

        prev_indent = indent_of(prev_line)

        # Special case for code blocks (>>>): in interactive sessions a prompt
        # after another prompt, a continuation, or any indented line is normal.
        if description == 'code block' and (
                prev_stripped.startswith(('...', '>>>')) or prev_indent > 0):
            continue

        # OK if the previous line is itself a list item / prompt / directive,
        # is a Sphinx field (:Example:, :param x:, ...), or is less indented
        # than this line (this line is then likely nested under it).
        if (starts_block(prev_line)
                or sphinx_field_pattern.match(prev_line)
                or current_indent > prev_indent):
            continue

        # Special case for bullet/numbered lists: the previous line may be wrapped
        # text of an earlier item, so look back for the line that owns this one.
        is_nested_list = False
        if description in ('bullet list item', 'numbered list item'):
            # Window of up to 9 preceding lines. The lower bound is -1 (not 0) so
            # the docstring's first line is inspected too; otherwise a list that
            # starts on line 0 with wrapped items is reported as a false positive.
            for j in range(i - 1, max(-1, i - 10), -1):
                check_line = lines[j]
                if not check_line.strip():
                    continue  # Skip blank lines
                if indent_of(check_line) <= current_indent:
                    # First line at the same or a shallower indent decides: a list
                    # item means we continue/nest a list; anything else is intro text.
                    is_nested_list = starts_block(check_line)
                    break

        if not is_nested_list:
            issues.append((i + 1, f"{description} without blank line before it"))

    return issues


def find_python_files(root_dir):
    """Recursively collect every path matching ``*.py`` beneath ``root_dir``.

    :param root_dir: Directory to search (string or path-like).
    :return: List of ``pathlib.Path`` objects, in the order ``rglob`` yields them.
    """
    return [*Path(root_dir).rglob('*.py')]


def _glob_matches(pattern):
    """Expand a glob pattern into a list of paths; relative patterns are resolved against the cwd."""
    anchor = Path(pattern).anchor
    if anchor:
        # Path.glob() raises NotImplementedError for non-relative patterns, so
        # glob from the pattern's own root/drive with the remainder instead.
        return list(Path(anchor).glob(str(Path(pattern).relative_to(anchor))))
    return list(Path.cwd().glob(pattern))


def _python_files_at(path):
    """Return the Python files denoted by a directory or .py file path, or None if it is neither."""
    if path.is_dir():
        return find_python_files(path)
    if path.is_file() and path.suffix == '.py':
        return [path]
    return None


def main():
    """
    Command-line entry point: check docstrings in the requested files.

    Positional arguments are files, directories (searched recursively) or glob
    patterns. With no arguments, the ``python`` directory next to this script's
    parent directory is checked, excluding any path with an ``examples``
    component. ``-v/--verbose`` prints every file checked.

    :return: Exit status: ``1`` if any issue was found, otherwise ``0``.
        Exits with status 1 via ``sys.exit`` if the default directory is missing.
    """
    parser = argparse.ArgumentParser(
        description='Check Python docstrings for formatting issues.'
    )
    parser.add_argument(
        'paths',
        nargs='*',
        help='Files or directories to check (default: python/ directory)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Show all files being checked, not just files with issues'
    )

    args = parser.parse_args()

    # Determine what to check
    if args.paths:
        files_to_check = []
        for path_str in args.paths:
            # Check if path contains glob characters
            if any(c in path_str for c in '*?['):
                matches = _glob_matches(path_str)
                if not matches:
                    print(f"Warning: {path_str} did not match any files", file=sys.stderr)
                for path in matches:
                    # Glob results that are neither a directory nor a .py file are skipped silently
                    files_to_check.extend(_python_files_at(path) or [])
            else:
                found = _python_files_at(Path(path_str))
                if found is None:
                    print(f"Warning: {path_str} is not a valid Python file or directory", file=sys.stderr)
                else:
                    files_to_check.extend(found)
    else:
        # Default to checking the python directory relative to this script
        script_dir = Path(__file__).parent
        python_dir = script_dir.parent / 'python'

        if not python_dir.exists():
            print(f"Error: Directory {python_dir} does not exist", file=sys.stderr)
            sys.exit(1)

        # Exclude examples subfolder when running with default path
        files_to_check = [f for f in find_python_files(python_dir)
                          if 'examples' not in f.parts]

    if args.verbose:
        print("Checking Python files...")
        print("=" * 80)

    files_with_issues = 0
    total_issues = 0

    for py_file in sorted(files_to_check):
        if args.verbose:
            print(f"Checking {py_file}...", end='', flush=True)

        file_issues = []
        for doc_line_num, docstring, node_type in get_docstrings_from_file(py_file):
            for line_offset, issue_desc in check_docstring_formatting(docstring):
                # line_offset is 1-based within the docstring and doc_line_num is the
                # 1-based line where the docstring opens, so subtract 1 to avoid
                # counting the opening line twice. Still reported with "~" because
                # the start line itself depends on how the docstring was located.
                abs_line = doc_line_num + line_offset - 1
                file_issues.append((abs_line, issue_desc, node_type))

        if file_issues:
            files_with_issues += 1
            total_issues += len(file_issues)

            if args.verbose:
                print(f" {len(file_issues)} issue(s) found")
            else:
                print(f"{py_file}: {len(file_issues)} issue(s) found")

            for line_num, issue_desc, node_type in sorted(file_issues):
                print(f"  Line ~{line_num} ({node_type}): {issue_desc}")
        elif args.verbose:
            print(" OK")

    if total_issues > 0:
        if args.verbose:
            print("=" * 80)
        print(f"\nFound {total_issues} issue(s) in {files_with_issues} file(s)")
        return 1

    if args.verbose:
        print("=" * 80)
        print("No issues found!")
    return 0


# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())