from __future__ import annotations

import json
import re


try:
    from transformers.utils import is_jmespath_available
except ImportError:
    # The helper is only used to decide whether the optional `jmespath` package may be
    # imported. If the helper itself cannot be imported, probe for the package directly
    # so that this module stays importable.
    def is_jmespath_available() -> bool:
        """Return True when the optional `jmespath` package can be located."""
        import importlib.util

        return importlib.util.find_spec("jmespath") is not None


if is_jmespath_available():
    import jmespath
else:
    jmespath = None


# Schema keys that run a regex over the node content (and therefore need string input).
_REGEX_KEYS = ("x-regex", "x-regex-iterator", "x-regex-key-value")
# Schema types that are converted from a single string value.
_PRIMITIVE_TYPES = ("string", "integer", "number", "boolean")


def _parse_re_match(node_match: re.Match) -> dict | str | None:
    """
    Convert a regex match into the content passed on to the rest of the parser.

    Args:
        node_match: A successful match object.

    Returns:
        If the regex has named groups, a dict of the named groups that took part in the match.
        Otherwise the value of the single unnamed group, which is `None` when that group is
        optional and did not take part in the match.

    Raises:
        ValueError: If the regex has no capture groups, or more than one unnamed group.
    """
    named_groups = node_match.groupdict()
    if named_groups:
        return {key: val for key, val in named_groups.items() if val is not None}
    # Without named groups the regex must have exactly one unnamed group, and we return that
    groups = list(node_match.groups())
    if len(groups) > 1:
        raise ValueError(f"Regex has multiple unnamed groups!\nGroups: {groups}\n")
    if not groups:
        raise ValueError(f"Regex has no capture groups:\n\n{node_match.group(0)}")
    return groups[0]


def _require_regex_input(node_content, regex_key: str, node_schema: dict) -> None:
    """Raise a descriptive TypeError if a follow-up regex would be applied to non-string content."""
    if not isinstance(node_content, str):
        raise TypeError(
            f"Schema node cannot apply {regex_key} to non-string content. This usually means "
            "that x-regex used named groups, which produce a dict.\n"
            f"Content: {node_content}\n"
            f"Schema: {node_schema}"
        )


def _apply_json_parser(node_content, node_schema: dict):
    """Parse string content as JSON, honouring the `allow_non_json` and `transform` parser args."""
    if not isinstance(node_content, str):
        raise TypeError(f"Node has JSON parser but got non-string input: {node_content}\nSchema: {node_schema}")
    parser_args = node_schema.get("x-parser-args", {})
    transform = parser_args.get("transform")
    allow_non_json = parser_args.get("allow_non_json", False)
    try:
        parsed_json = json.loads(node_content)
    except json.JSONDecodeError as e:
        if not allow_non_json:
            # Chain the decode error so the failing position stays visible in the traceback
            raise ValueError(
                f"Node has JSON parser but could not parse its contents as JSON. You can use the `allow_non_json` parser arg for nodes which may contain JSON or string content.\n\nContent: {node_content}\n\nError: {e}"
            ) from e
        parsed_json = node_content
    if transform is not None:
        if jmespath is None:
            raise ImportError(
                "Chat response schema includes a jmespath transformation, but jmespath is not installed. You can install it with `pip install jmespath`."
            )
        parsed_json = jmespath.search(transform, parsed_json)
    return parsed_json


def _apply_mappings(node_content, node_schema: dict):
    """Apply `x-mapping` (exact lookup) and then `x-mapping-regex` (substitutions) to string content."""
    node_type = node_schema.get("type")
    if "x-mapping" in node_schema:
        if not isinstance(node_content, str):
            raise TypeError(
                f"Schema node with type {node_type} cannot use x-mapping on non-string content.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        mapping = node_schema["x-mapping"]
        # Values without an entry in the mapping are passed through unchanged
        if node_content in mapping:
            node_content = mapping[node_content]
    if "x-mapping-regex" in node_schema:
        if not isinstance(node_content, str):
            raise TypeError(
                f"Schema node with type {node_type} cannot use x-mapping-regex on non-string content.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        for pattern, replacement in node_schema["x-mapping-regex"].items():
            node_content = re.sub(pattern, replacement, node_content, flags=re.DOTALL)
    return node_content


def _parse_object_node(node_content, node_schema: dict) -> dict:
    """Build the output dict for a node with type `object` from string or dict content."""
    parsed_schema = {}
    if isinstance(node_content, str):
        # This means we don't have a regex at this level, so all of our child nodes need to parse the whole
        # string themselves to extract their value.
        if "properties" not in node_schema:
            raise ValueError(
                f"Object node received string content but has no regex or parser to handle it.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        for key, child_node in node_schema["properties"].items():
            child_node_content = recursive_parse(node_content, child_node)
            # Children that found nothing are left out of the output entirely
            if child_node_content is not None:
                parsed_schema[key] = child_node_content
        return parsed_schema
    if isinstance(node_content, dict):
        properties = node_schema.get("properties", {})
        for key, child_node in properties.items():
            if "const" in child_node:
                parsed_schema[key] = child_node["const"]
            elif key in node_content:
                parsed_schema[key] = recursive_parse(node_content[key], child_node)
            elif "default" in child_node:
                parsed_schema[key] = child_node["default"]
        if "additionalProperties" in node_schema:
            extra_schema = node_schema["additionalProperties"]
            for key, value in node_content.items():
                if key not in properties:
                    parsed_schema[key] = recursive_parse(value, extra_schema)
        return parsed_schema
    raise TypeError(f"Expected a dict or str for schema node with type object, got {node_content}")


def _parse_array_node(node_content, node_schema: dict) -> list:
    """Build the output list for a node with type `array` using its `items` or `prefixItems` schema."""
    if not node_content:
        return []
    if "items" in node_schema:
        if not isinstance(node_content, list):
            raise TypeError(f"Expected a list or regex for schema node with type array, got {node_content}")
        item_schema = node_schema["items"]
        return [recursive_parse(item, item_schema) for item in node_content]
    if "prefixItems" in node_schema:
        prefix_items = node_schema["prefixItems"]
        if not isinstance(node_content, list):
            if len(prefix_items) != 1:
                raise TypeError(f"Expected a list or regex for schema node with type array, got {node_content}")
            # If there's only one prefix item, this is a single item array, we can just wrap the string
            node_content = [node_content]
        if len(node_content) != len(prefix_items):
            raise ValueError(
                f"Array node has {len(node_content)} items, but schema only has "
                f"{len(prefix_items)} prefixItems defined.\n"
                f"Content: {node_content}\n"
                f"Schema: {node_schema}"
            )
        return [recursive_parse(item, item_schema) for item, item_schema in zip(node_content, prefix_items)]
    raise ValueError(f"Array node has no items or prefixItems schema defined.\nSchema: {node_schema}")


def _parse_primitive_node(node_content, node_type: str):
    """Convert string content to the Python value for a `string`, `integer`, `number` or `boolean` node."""
    if not isinstance(node_content, str):
        raise TypeError(f"Expected a string for schema node with type {node_type}, got {node_content}")
    if node_type == "integer":
        return int(node_content)
    if node_type == "number":
        return float(node_content)
    if node_type == "boolean":
        lowered = node_content.lower()
        if lowered in ("true", "1"):
            return True
        if lowered in ("false", "0"):
            return False
        raise ValueError(f"Invalid boolean value: {node_content}")
    return node_content  # String type


def recursive_parse(
    node_content: str | list | dict,
    node_schema: dict,
):
    """
    This function takes content and a JSON schema which includes regex extractors, and recursively parses the
    content. The output should be a data structure matching the schema.

    Processing order for a node: `const` short-circuit, regex extraction (`x-regex`, then `x-regex-iterator`,
    then `x-regex-key-value`), `x-parser`, `x-mapping` / `x-mapping-regex`, and finally conversion based on
    the schema `type`, recursing into child schemas where required.

    Args:
        node_content: The content corresponding to this node. Usually a string, but can be something else if the
                      parent node has multiple capture groups or named groups. In that case, we generally pass the
                      capture groups straight through to the children of this node and don't do any parsing at
                      this level.
        node_schema: The schema node controlling the parsing.

    Returns:
        The parsed data structure for the current node. `None` is returned when the content is `None`, when a
        regex on this node finds nothing, or when the single capture group of `x-regex` is optional and did not
        take part in the match (array nodes without an iterator return `[]` in that last case).

    Raises:
        TypeError: If the content has the wrong type for a regex, parser, mapping or schema type, if
            `x-regex-iterator` / `x-regex-key-value` are used on a node of the wrong type, or if the schema type
            is unsupported.
        ValueError: If a regex has unusable capture groups, JSON content cannot be decoded, the parser is
            unknown, an array schema is incomplete or mismatched, or a boolean/number cannot be converted.
        ImportError: If a jmespath `transform` is requested but jmespath is not installed.
    """
    # If the schema has a const, we just return that value and do absolutely nothing else
    if "const" in node_schema:
        return node_schema["const"]

    # If the node content is None, we return None. EZ.
    if node_content is None:
        return None

    # If not, we have to do a little parsing. First, set some vars and do basic validation
    node_type = node_schema.get("type")
    has_regex = any(key in node_schema for key in _REGEX_KEYS)
    if has_regex and not isinstance(node_content, str):
        raise TypeError(
            "Schema node got a non-string input, but has a regex for parsing.\n"
            f"Input: {node_content}\n"
            f"Schema: {node_schema}"
        )

    node_regex = node_schema.get("x-regex")
    node_regex_iterator = node_schema.get("x-regex-iterator")
    node_regex_to_dict = node_schema.get("x-regex-key-value")

    if node_regex is not None:
        node_match = re.search(node_regex, node_content, flags=re.DOTALL)
        if not node_match:
            return None
        # May be None if the single capture group is optional and did not participate
        node_content = _parse_re_match(node_match)

    if node_regex_iterator is not None:
        if node_type != "array":
            raise TypeError(f"Schema node with type {node_type} cannot use x-regex-iterator.\nSchema: {node_schema}")
        # Note that this can be applied after a standard node-regex search
        if node_content is None:
            return None
        _require_regex_input(node_content, "x-regex-iterator", node_schema)
        node_content = [
            _parse_re_match(item_match)
            for item_match in re.finditer(node_regex_iterator, node_content, flags=re.DOTALL)
        ]
        if not node_content:
            return None

    if node_regex_to_dict is not None:
        if node_type != "object":
            raise TypeError(f"Schema node with type {node_type} cannot use x-regex-key-value.\nSchema: {node_schema}")
        # Note that this can be applied after a standard node-regex search
        if node_content is None:
            return None
        _require_regex_input(node_content, "x-regex-key-value", node_schema)
        output_content = {}
        for pair_match in re.finditer(node_regex_to_dict, node_content, flags=re.DOTALL):
            match_groups = _parse_re_match(pair_match)
            if not isinstance(match_groups, dict) or "key" not in match_groups or "value" not in match_groups:
                raise ValueError(
                    f"Regex for x-regex-key-value must have named groups 'key' and 'value'.\n"
                    f"Match groups: {match_groups}\n"
                    f"Schema: {node_schema}"
                )
            output_content[match_groups["key"]] = match_groups["value"]
        node_content = output_content
        if not node_content:
            return None

    if node_content is None:
        # x-regex matched, but its optional capture group captured nothing. Treat this as "nothing
        # extracted" instead of letting None crash the parser/mapping/type steps below. Array nodes
        # have always turned empty content into [], so keep doing that for them.
        return [] if node_type == "array" else None

    # Next, if the node has a parser, apply it. We do this after regexes so that the regex can extract
    # a substring to parse, if needed.
    if "x-parser" in node_schema:
        parser = node_schema["x-parser"]
        if parser != "json":
            raise ValueError(f"Unknown parser {parser} for schema node: {node_schema}")
        node_content = _apply_json_parser(node_content, node_schema)

    # If there's a mapping, apply it now
    node_content = _apply_mappings(node_content, node_schema)

    # Finally, handle parsed content based on schema type and recurse if required
    if node_type == "object":
        return _parse_object_node(node_content, node_schema)
    if node_type == "array":
        return _parse_array_node(node_content, node_schema)
    if node_type in _PRIMITIVE_TYPES:
        return _parse_primitive_node(node_content, node_type)
    if node_type is None:
        return node_content  # Don't touch it
    raise TypeError(f"Unsupported schema type {node_type} for node: {node_content}")