Source code for semantic_release.commit_parser.conventional.parser

from __future__ import annotations

from functools import reduce
from logging import getLogger
from re import (
    DOTALL,
    IGNORECASE,
    MULTILINE,
    Match as RegexMatch,
    Pattern,
    compile as regexp,
    error as RegexError,  # noqa: N812
)
from textwrap import dedent
from typing import TYPE_CHECKING, ClassVar

from git.objects.commit import Commit

from semantic_release.commit_parser._base import CommitParser
from semantic_release.commit_parser.conventional.options import (
    ConventionalCommitParserOptions,
)
from semantic_release.commit_parser.token import (
    ParsedCommit,
    ParsedMessageResult,
    ParseError,
    ParseResult,
)
from semantic_release.commit_parser.util import (
    breaking_re,
    deep_copy_commit,
    force_str,
    parse_paragraphs,
)
from semantic_release.enums import LevelBump
from semantic_release.errors import InvalidParserOptions
from semantic_release.helpers import sort_numerically, text_reducer

if TYPE_CHECKING:
    pass


# TODO: Remove from here, allow for user customization instead via options
# types with long names in changelog
# Mapping of short conventional-commit type tags to the longer,
# human-readable section names used when rendering the changelog.
LONG_TYPE_NAMES = dict(
    build="build system",
    ci="continuous integration",
    chore="chores",
    docs="documentation",
    feat="features",
    fix="bug fixes",
    perf="performance improvements",
    refactor="refactoring",
    style="code style",
    test="testing",
)


class ConventionalCommitParser(
    CommitParser[ParseResult, ConventionalCommitParserOptions]
):
    """
    Commit parser for projects that follow the Conventional Commits specification.

    See https://www.conventionalcommits.org/en/v1.0.0/
    """

    # TODO: Deprecate in lieu of get_default_options()
    parser_options = ConventionalCommitParserOptions

    # GitHub & Gitea use (#123), GitLab uses (!123), and BitBucket uses (pull request #123)
    mr_selector = regexp(
        r"[\t ]+\((?:pull request )?(?P<mr_number>[#!]\d+)\)[\t ]*$"
    )

    # Footer lines such as "Closes: #123" / "Fixes: ABC-42" that link issues
    issue_selector = regexp(
        r"^(?:clos(?:e|es|ed|ing)|fix(?:es|ed|ing)?|resolv(?:e|es|ed|ing)|implement(?:s|ed|ing)?):"
        r"[\t ]+(?P<issue_predicate>.+)[\t ]*$",
        flags=MULTILINE | IGNORECASE,
    )

    # Paragraphs starting with "NOTICE: " become release notices
    notice_selector = regexp(r"^NOTICE: (?P<notice>.+)$")

    # Filters shared by all message-normalization passes: each entry maps a
    # name to (pattern, replacement) applied via re.sub-style text reduction.
    common_commit_msg_filters: ClassVar[dict[str, tuple[Pattern[str], str]]] = {
        "typo-extra-spaces": (regexp(r"(\S) +(\S)"), r"\1 \2"),
        "git-header-commit": (
            regexp(r"^[\t ]*commit [0-9a-f]+$\n?", flags=MULTILINE),
            "",
        ),
        "git-header-author": (
            regexp(r"^[\t ]*Author: .+$\n?", flags=MULTILINE),
            "",
        ),
        "git-header-date": (
            regexp(r"^[\t ]*Date: .+$\n?", flags=MULTILINE),
            "",
        ),
        "git-squash-heading": (
            regexp(
                r"^[\t ]*Squashed commit of the following:.*$\n?",
                flags=MULTILINE,
            ),
            "",
        ),
    }

    def __init__(self, options: ConventionalCommitParserOptions | None = None) -> None:
        """
        Initialize the parser, compiling the commit-type dependent patterns.

        :param options: parser configuration; ``None`` selects the defaults.
        :raises InvalidParserOptions: when the configured commit-types cannot be
            combined into a valid regular expression.
        """
        super().__init__(options)
        self._logger = getLogger(
            ".".join([self.__module__, self.__class__.__name__])
        )

        try:
            commit_type_pattern = regexp(
                r"(?P<type>%s)" % "|".join(self.options.allowed_tags)
            )
        except RegexError as err:
            raise InvalidParserOptions(
                "\n".join(
                    [
                        f"Invalid options for {self.__class__.__name__}",
                        "Unable to create regular expression from configured commit-types.",
                        "Please check the configured commit-types and remove or escape any regular expression characters.",
                    ]
                )
            ) from err

        # Summary line: "type(scope)!: subject"
        self.commit_subject = regexp(
            f"^{commit_type_pattern.pattern}"
            r"(?:\((?P<scope>[^\n]+)\))?"
            r"(?P<break>!)?:\s+"
            r"(?P<subject>[^\n]+)"
        )

        # Full message: summary line optionally followed by a blank line and body
        self.commit_msg_pattern = regexp(
            self.commit_subject.pattern + r"(?:\n\n(?P<text>.+))?",  # commit body
            flags=DOTALL,
        )

        self.filters: dict[str, tuple[Pattern[str], str]] = {
            **self.common_commit_msg_filters,
            "git-squash-commit-prefix": (
                regexp(
                    # bullet points or indentation prior to the commit type
                    r"^(?:[\t ]*[*-][\t ]+|[\t ]+)?"
                    + commit_type_pattern.pattern
                    + r"\b",
                    flags=MULTILINE,
                ),
                # move commit type to the start of the line
                r"\1",
            ),
        }
[docs] def get_default_options(self) -> ConventionalCommitParserOptions: return ConventionalCommitParserOptions()
[docs] def log_parse_error(self, commit: Commit, error: str) -> ParseError: self._logger.debug(error) return ParseError(commit, error=error)
[docs] def commit_body_components_separator( self, accumulator: dict[str, list[str]], text: str ) -> dict[str, list[str]]: if (match := breaking_re.match(text)) and (brk_desc := match.group(1)): accumulator["breaking_descriptions"].append(brk_desc) return accumulator if (match := self.notice_selector.match(text)) and ( notice := match.group("notice") ): accumulator["notices"].append(notice) return accumulator if match := self.issue_selector.search(text): # if match := self.issue_selector.search(text): predicate = regexp(r",? and | *[,;/& ] *").sub( ",", match.group("issue_predicate") or "" ) # Almost all issue trackers use a number to reference an issue so # we use a simple regexp to validate the existence of a number which helps filter out # any non-issue references that don't fit our expected format has_number = regexp(r"\d+") new_issue_refs: set[str] = set( filter( lambda issue_str, validator=has_number: validator.search(issue_str), # type: ignore[arg-type] predicate.split(","), ) ) if new_issue_refs: accumulator["linked_issues"] = sort_numerically( set(accumulator["linked_issues"]).union(new_issue_refs) ) return accumulator # Prevent appending duplicate descriptions if text not in accumulator["descriptions"]: accumulator["descriptions"].append(text) return accumulator
[docs] def parse_message(self, message: str) -> ParsedMessageResult | None: return ( self.create_parsed_message_result(match) if (match := self.commit_msg_pattern.match(message)) else None )
[docs] def create_parsed_message_result( self, match: RegexMatch[str] ) -> ParsedMessageResult: parsed_break = match.group("break") parsed_scope = match.group("scope") or "" parsed_subject = match.group("subject") parsed_text = match.group("text") parsed_type = match.group("type") linked_merge_request = "" if mr_match := self.mr_selector.search(parsed_subject): linked_merge_request = mr_match.group("mr_number") parsed_subject = self.mr_selector.sub("", parsed_subject).strip() body_components: dict[str, list[str]] = reduce( self.commit_body_components_separator, [ # Insert the subject before the other paragraphs parsed_subject, *parse_paragraphs(parsed_text or ""), ], { "breaking_descriptions": [], "descriptions": [], "notices": [], "linked_issues": [], }, ) level_bump = ( LevelBump.MAJOR if body_components["breaking_descriptions"] or parsed_break else self.options.tag_to_level.get( parsed_type, self.options.default_bump_level ) ) return ParsedMessageResult( bump=level_bump, type=parsed_type, category=LONG_TYPE_NAMES.get(parsed_type, parsed_type), scope=parsed_scope, descriptions=tuple(body_components["descriptions"]), breaking_descriptions=tuple(body_components["breaking_descriptions"]), release_notices=tuple(body_components["notices"]), linked_issues=tuple(body_components["linked_issues"]), linked_merge_request=linked_merge_request, )
[docs] @staticmethod def is_merge_commit(commit: Commit) -> bool: return len(commit.parents) > 1
[docs] def parse_commit(self, commit: Commit) -> ParseResult: if not (parsed_msg_result := self.parse_message(force_str(commit.message))): return self.log_parse_error( commit, f"Unable to parse commit message: {commit.message!r}", ) return ParsedCommit.from_parsed_message_result(commit, parsed_msg_result)
# Maybe this can be cached as an optimization, similar to how # mypy/pytest use their own caching directories, for very large commit # histories? # The problem is the cache likely won't be present in CI environments
[docs] def parse(self, commit: Commit) -> ParseResult | list[ParseResult]: """ Parse a commit message If the commit message is a squashed merge commit, it will be split into multiple commits, each of which will be parsed separately. Single commits will be returned as a list of a single ParseResult. """ if self.options.ignore_merge_commits and self.is_merge_commit(commit): return self.log_parse_error( commit, "Ignoring merge commit: %s" % commit.hexsha[:8] ) separate_commits: list[Commit] = ( self.unsquash_commit(commit) if self.options.parse_squash_commits else [commit] ) # Parse each commit individually if there were more than one parsed_commits: list[ParseResult] = list( map(self.parse_commit, separate_commits) ) def add_linked_merge_request( parsed_result: ParseResult, mr_number: str ) -> ParseResult: return ( parsed_result if not isinstance(parsed_result, ParsedCommit) else ParsedCommit( **{ **parsed_result._asdict(), "linked_merge_request": mr_number, } ) ) # TODO: improve this for other VCS systems other than GitHub & BitBucket # Github works as the first commit in a squash merge commit has the PR number # appended to the first line of the commit message lead_commit = next(iter(parsed_commits)) if isinstance(lead_commit, ParsedCommit) and lead_commit.linked_merge_request: # If the first commit has linked merge requests, assume all commits # are part of the same PR and add the linked merge requests to all # parsed commits parsed_commits = [ lead_commit, *map( lambda parsed_result, mr=lead_commit.linked_merge_request: ( # type: ignore[misc] add_linked_merge_request(parsed_result, mr) ), parsed_commits[1:], ), ] elif isinstance(lead_commit, ParseError) and ( mr_match := self.mr_selector.search(force_str(lead_commit.message)) ): # Handle BitBucket Squash Merge Commits (see #1085), which have non angular commit # format but include the PR number in the commit subject that we want to extract linked_merge_request = mr_match.group("mr_number") # apply the linked MR 
to all commits parsed_commits = [ add_linked_merge_request(parsed_result, linked_merge_request) for parsed_result in parsed_commits ] return parsed_commits
[docs] def unsquash_commit(self, commit: Commit) -> list[Commit]: # GitHub EXAMPLE: # feat(changelog): add autofit_text_width filter to template environment (#1062) # # This change adds an equivalent style formatter that can apply a text alignment # to a maximum width and also maintain an indent over paragraphs of text # # * docs(changelog-templates): add definition & usage of autofit_text_width template filter # # * test(changelog-context): add test cases to check autofit_text_width filter use # # `git merge --squash` EXAMPLE: # Squashed commit of the following: # # commit 63ec09b9e844e616dcaa7bae35a0b66671b59fbb # Author: codejedi365 <codejedi365@gmail.com> # Date: Sun Oct 13 12:05:23 2024 -0600 # # feat(release-config): some commit subject # # Return a list of artificial commits (each with a single commit message) return [ # create a artificial commit object (copy of original but with modified message) Commit( **{ **deep_copy_commit(commit), "message": commit_msg, } ) for commit_msg in self.unsquash_commit_message(force_str(commit.message)) ] or [commit]
[docs] def unsquash_commit_message(self, message: str) -> list[str]: normalized_message = message.replace("\r", "").strip() # split by obvious separate commits (applies to manual git squash merges) obvious_squashed_commits = self.filters["git-header-commit"][0].split( normalized_message ) separate_commit_msgs: list[str] = reduce( lambda all_msgs, msgs: all_msgs + msgs, map(self._find_squashed_commits_in_str, obvious_squashed_commits), [], ) return list(filter(None, separate_commit_msgs))
def _find_squashed_commits_in_str(self, message: str) -> list[str]: separate_commit_msgs: list[str] = [] current_msg = "" for paragraph in filter(None, message.strip().split("\n\n")): # Apply filters to normalize the paragraph clean_paragraph = reduce(text_reducer, self.filters.values(), paragraph) # remove any filtered (and now empty) paragraphs (ie. the git headers) if not clean_paragraph.strip(): continue # Check if the paragraph is the start of a new conventional commit # Note: that we check that the subject has more than one word to differentiate from # a closing footer (e.g. "fix: #123", or "fix: ABC-123") if (match := self.commit_subject.search(clean_paragraph)) and len( match.group("subject").split(" ") ) > 1: # Since we found the start of the new commit, store any previous commit # message separately and start the new commit message if current_msg: separate_commit_msgs.append(current_msg) current_msg = clean_paragraph continue if not separate_commit_msgs and not current_msg: # if there are no separate commit messages and no current message # then this is the first commit message current_msg = dedent(clean_paragraph) continue # append the paragraph as part of the previous commit message if current_msg: current_msg += f"\n\n{dedent(clean_paragraph)}" # else: drop the paragraph continue return [*separate_commit_msgs, current_msg]