
    :jy8                         d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ  ed      Z G d	 d
e      Zy)z
Splittability analyzer to determine if a regex pattern can be further split safely.
This module prevents infinite loops and ensures convergence in regex enumeration.
    N)ListTuple)
get_logger   )CharClassSegmentFixedSegmentGroupSegmentOptionalSegmentSegment)ISplittabilityAnalyzerrefinec                   B   e Zd ZdZ	 	 	 	 	 	 ddedededededefdZ	 dd	ed
e	e
   dedeeef   fdZd	edefdZd	edefdZd
e	e
   de	e   fdZdedefdZd
e	e   defdZd
e	e
   defdZdede	e
   defdZdede	e
   defdZd
e	e   defdZd
e	e   defdZy)SplittabilityAnalyzerz]Analyze whether a regex pattern can be safely split further based on mathematical principles.enable_recursion_limitenable_value_thresholdenable_resource_limitmax_recursion_depthmin_enumeration_valuemax_resource_costc                 X    || _         || _        || _        || _        || _        || _        y )N)r   r   r   r   r   r   )selfr   r   r   r   r   r   s          ;/root/.openclaw/workspace/harvester/refine/splittability.py__init__zSplittabilityAnalyzer.__init__   s6     '=#&<#%:" $7 %:"!2    patternsegmentscurrent_depthreturnc                 Z   t         j                  d|dd  t        |      dkD  rdnd        | j                  |      ry| j	                  |      ry| j                  |      }|sy| j                  |      sy	| j                  r|| j                  k\  rd
d| fS | j                  r(| j                  |      }|| j                  k  rd
d|dfS | j                  r(| j                  |      }|| j                  kD  rd
d|dfS dt        |       d}t         j                  d|        d|fS )aD  
        Determine if a regex pattern can be split further based on mathematical principles.
        Time complexity: O(n) where n is the number of segments.

        Core conditions (always checked):
        1. Unsupported features check
        2. Basic cycle detection
        3. Enumerability check
        4. Convergence analysis

        Args:
            pattern: The regex pattern to analyze
            segments: Parsed segments of the pattern
            current_depth: Current recursion depth

        Returns:
            Tuple[bool, str]: (can_split, reason)
        zAnalyzing pattern: N2   z... )Fz%Pattern contains unsupported features)Fz"Cycle detected in pattern analysis)FzNo enumerable segments found)FzEnumeration will not convergeFzRecursion limit reached: zValue threshold not met: .3fzResource limit exceeded: z.2fzSplittable: z enumerable segmentszPattern analysis passed: T)loggerdebuglen_has_unsupported_features_detect_basic_cycle_find_enumerable_segments_will_converger   r   r   _calc_total_valuer   r   _estimate_costr   )r   r   r   r   enumerable_segmentstotal_valuecostreasons           r   	can_splitzSplittabilityAnalyzer.can_split.   s_   0 	*73B<.#g,QSBSY[8\]^ ))'2A ##G,> #<<XF"8 ""#679 &&=D<T<T+T5m_EEE&&001DEKT777 9+c9JKKK%%&&':;Dd,,, 9$sDDD$7 899MN09:V|r   c                    d|v rJt        j                  d|      rt        j                  d       yd|v sd|v rt        j                  d       yd|v rt        j                  d	       yd
|v sd|v rt        j                  d       yy)a7  
        Check for truly unsupported regex features that prevent enumeration entirely.
        Time complexity: O(1) amortized.

        Note: Many advanced features like lookahead, lookbehind, word boundaries, and
        possessive quantifiers are actually supported if there are enumerable segments.
        \z\\[1-9]z-Backreference detected - prevents enumerationTz\p{z\P{z)Unicode property detected - not supportedz(?(z/Conditional expression detected - not supportedz(?R)z(?0)*Recursive pattern detected - not supportedFresearchr#   r$   )r   r   s     r   r&   z/SplittabilityAnalyzer._has_unsupported_featuresk   s     7?yyW-LM  Fg$5HI GLLJK W' 1LLEFr   c                 t    g d}|D ]/  }t        j                  ||      st        j                  d        y y)z5Basic cycle detection for obvious recursive patterns.)z	\(.+\)\\1z
\(.*\)\.\*z\(.+\)\+r3   TFr4   )r   r   recursive_patternsrecursive_patterns       r   r'   z)SplittabilityAnalyzer._detect_basic_cycle   s@    
 "4 	yy*G4IJ	
 r   c                 *   g }t        |      }|r|j                         }t        |t              r#| j	                  |      rO|j                  |       n=t        |t        t        f      r'|j                  r|j                  |j                         |r|S )z<Find segments that can be enumerated. Time complexity: O(n).)
listpop
isinstancer   _is_enumerableappendr	   r
   contentextend)r   r   
enumerablestacksegments        r   r(   z/SplittabilityAnalyzer._find_enumerable_segments   sz    
 XiikG'#34&&w/%%g.GlO%DE??LL1  r   rD   c                     |j                   syt        |j                         }|dk  ry|dkD  ry|j                  }|dk(  ryy)z
        Check if a segment can be enumerated to reduce search space.
        Focus on whether enumerating 2-3 characters would be beneficial.
        Time complexity: O(1).
        Fr      r   T)charsetr%   
max_length)r   rD   charset_sizemax_lens       r   r>   z$SplittabilityAnalyzer._is_enumerable   sT     7??+ 1 # $$ a< r   c                     |sy|D ]6  }t        |j                        }|dkD  st        j                  d|         y y)a  
        Check if enumeration will converge based on mathematical principles.
        Core conditions: 1) Enumerable charset exists 2) Search space reduces after enumeration
        Time complexity: O(1) - simplified for mathematical convergence only.
        Fr   z%Convergence guaranteed: charset_size=T)r%   rG   r#   r$   )r   r   rD   rI   s       r   r)   z$SplittabilityAnalyzer._will_converge   sO        	Gw/La D\NST	 r   c                    d}|D ]  }t        |t              rzt        |j                        }|dk  r.d}|j                  }|j
                  }|t        d      k(  rt        |d      }n||z   dz  }|dkD  sq||t        j                  |      z  z  }t        |t              r|t        j                  d      z  }t        |t              s|j                  s|| j                  |j                        z  } |S )zUCalculate search space in logarithmic space to avoid overflow. Time complexity: O(n).        r         ?inf      )r=   r   r%   rG   
min_lengthrH   floatmaxmathlogr
   r	   r@   _calc_log_search_space)r   r   	log_totalrD   rI   
avg_lengthmin_lenrJ   s           r   rW   z,SplittabilityAnalyzer._calc_log_search_space   s    	 	NG'#34"7??31$ !
!,,!,,eEl*!$WaJ")G"3q!8J  !#dhh|.D!DDIG_5TXXa[(	 G\2??!<!<W__!MMI;	N> r   all_segmentsc                 "   t        |j                        }|dk  ry| j                  ||      }t        d|j                        }|dk  rd}n|dk  rd}n
|dk  rd	}nd
}||dz  z  }d|dz  z   }||z  }t
        j                  d| d|d       |S )z
        Calculate the benefit of enumerating this segment.
        Returns the expected reduction in search space complexity.
        Time complexity: O(1).
        r   rM   rP   
   rN      g?>   g333333?g333333?g?z%Enumeration benefit for charset size z: r"   )r%   rG   _calc_context_valueminrR   r#   r$   )	r   rD   r[   rI   context_valueenum_positionsbase_benefitcontext_multipliertotal_benefits	            r   _calc_enumeration_benefitz/SplittabilityAnalyzer._calc_enumeration_benefit  s     7??+1 00,G Q 2 23 2LRLRLL 	,, !MC$78$'99<\N"][^L_`ar   c                     |j                   }d}|D ]R  }t        |t              s|j                   }t        |j                        }t        ||z
        }dd|z   z  }	|||	z  z  }T t        |d      S )z8Calculate how much fixed context surrounds this segment.rM   rN   g      @)positionr=   r   r%   r@   absra   )
r   rD   r[   ri   rb   other_segment	other_poscontent_lendistanceweights
             r   r`   z)SplittabilityAnalyzer._calc_context_value7  s    ## * 	6M-6)22	!-"7"78 y834h/v!55	6 =#&&r   c                    d}|D ]w  }|j                   dkD  r||j                   z  }"t        |j                        }|dkD  s=t        j                  |      }t        |j                  dz  d      }|d|z   z  }||z  }y |S )zCFast calculation of total enumeration value. Time complexity: O(n).rM   r   r   g?rN   )valuer%   rG   rU   rV   ra   ri   )r   r   r-   segrI   
base_valueposition_boosts          r   r*   z'SplittabilityAnalyzer._calc_total_valueI  s     	.Cyy1}syy(  #3;;/!#!%,!7J &));S%AN!n"44J:-K	." r   c                    |st        d      S d}|D ]r  }t        |j                        }t        j                  |dz         }|j
                  }|t        d      k(  r|dz  }n|t        j                  |dz         dz  z  }||z  }t |S )z,Fast cost estimation. Time complexity: O(n).rO   rM   r   g      @g      ?)rS   r%   rG   rU   rV   rH   )r   r   
total_costrr   rI   	base_costrJ   s          r   r+   z$SplittabilityAnalyzer._estimate_cost`  s    <
 	$Cs{{+L !12I nnG%,&S 	TXXgk2S88	)#J	$ r   N)FFFr]   g{Gz?g      I@)r   )__name__
__module____qualname____doc__boolintrS   r   strr   r   r   r0   r&   r'   r   r(   r>   r)   rW   rg   r`   r*   r+    r   r   r   r      s   g (-',&+#%'+#'3 $3 !%3  $	3
 !3  %3 !3. 	;; w-; 	;
 
tSy	;z  >3 4  $w- DIYDZ ( &6  4  Dt,<'= $ ,#tG} # #J*1A *QUV]Q^ *ch *X'+; '4PW= ']b '$$/?*@ U .t,<'= % r   r   )r{   rU   r5   typingr   r   tools.loggerr   rD   r   r   r	   r
   r   typesr   r#   r   r   r   r   <module>r      s@   
  	  #  *	H	^2 ^r   