
    :jVH                         d Z ddlZddlZddlmZmZmZmZmZ ddl	m
Z
 ddlmZmZmZmZmZ ddlmZ  e
d      Z G d	 d
e      Zy)z9
Regex pattern parser with support for complex patterns.
    N)ListOptionalSetTupleUnion)
get_logger   )CharClassSegmentFixedSegmentGroupSegmentOptionalSegmentSegment)IRegexParserrefinec            
          e Zd ZdZddefdZdedee   fdZ	dededede
ee   ef   fd	Zdededede
ee   ef   fd
Zdededede
ee   ef   fdZdedee   fdZdedefdZdededede
eef   fdZdede
eeeef   f   fdZdededede
ee   ef   fdZdededede
eef   fdZdedefdZdedefdZdedefdZdedefdZdedefdZdee   ddfdZ y)RegexParserz,Parse regex patterns into segment sequences.max_quantifier_lengthc                     || _         y N)r   )selfr   s     4/root/.openclaw/workspace/harvester/refine/parser.py__init__zRegexParser.__init__   s
    %:"    patternreturnc                 t   |sg S | j                  |      }dt        |      }}t               }	 ||k  rA| j                  |||      \  }}|r"t        |      |_        |j                  |       nn||k  rA| j                  |       |S # t        $ r'}t        j                  d| d|        g cY d}~S d}~ww xY w)z"Parse regex pattern into segments.r   zFailed to parse pattern 'z': N)
_preprocess_patternlenlist_parse_nextpositionappend_calculate_prefix_lengths	Exceptionloggerwarning)r   r   preprocessedposlengthsegmentssegmentes           r   parsezRegexParser.parse   s    I //8\*V6	,#//c6J'*8}G$OOG, , **84O 	NN6wis1#FGI	s$   AB 4B 	B7B2,B72B7r(   r)   c                     ||k\  rd|fS ||   }|dk(  r| j                  |||      S |dk(  r| j                  |||      S |dv r| j                  |||      S | j                  |||      S )z)Parse next segment from current position.N([z.*+?{}^$|\\)_parse_group_parse_charclass_parse_special_parse_fixed)r   r   r(   r)   chars        r   r    zRegexParser._parse_next:   s    &=9s|3;$$Wc6::S[((#v>>^#&&wV<<$$Wc6::r   c                    |}|dz  }||k\  rd|fS d}d}||k  r||   dk(  rz|}|dz  }||k  rn||   dk(  r
d}d}|dz  }n\|dz   |k  r||   d	k(  r||dz      d
k(  r
d}|dz  }d}n7||k  r||   dvr|dz  }||k  r||   dvr||| }||k  r||   dk(  r|dz  }d}d}|}	||k  r0|dkD  r+||   dk(  r|dz  }n||   dk(  r|dz  }|dz  }||k  r|dkD  r+|dkD  rt         j                  d       d|fS ||	|dz
   dv r0t        fddD              st               }
d|
_        |
_        |
g}n&t        | j                        }|j                        }| j                  |||      \  }}|dk(  rt               }||_        ||_        ||fS t               }||_        ||_        | |_        |r||_        |r||_        ||fS )z3Parse group patterns (...) or (?:...) or (?-i) etc.r	   NF ?:Tz?:-iz?-i   z):r   r/   )z Unmatched parentheses in pattern|c              3   &   K   | ]  }|v  
 y wr    ).0r5   group_patterns     r   	<genexpr>z+RegexParser._parse_group.<locals>.<genexpr>   s     +ZdDM,A+Z   z	[]{}*+?())r%   r&   anyr   r!   contentr   r   r-   _parse_quantifierr   r   	capturingoriginal_prefix
quantifier)r   r   r(   r)   	start_posnon_capturingrI   prefix_startparen_countgroup_startchoice_segmentgroup_content
sub_parserrJ   r+   rB   s                  @r   r1   zRegexParser._parse_groupJ   sx   	q&=9 <GCLC/L1HCV|3<3&$(M&*O1HC1Wv%'#,#*='#PQ'BRVYBY&+O1HC$(M ,73<t+Cq ,73<t+C&-l3&?OV|(;q(, Fl{Qs|s"q $q 1HC Fl{Q ?NN=>9  cAg6 -+Zk+Z(Z)^N&'N#%2N"+,M %T%?%?@J&,,];M 00#vF
C%'G(G+GOC<"nG(G+GO$1 1G*9'%/"C<r   c                    |}|dz  }||k\  rd|fS d}||k  r#||   dk7  r|||   z  }|dz  }||k  r	||   dk7  r||k\  rt         j                  d       d|fS |dz  }| j                  |      }|sd|fS | j                  |||      \  }}| j	                  |      \  }}	| j                  |      }
t               }||_        ||_        ||_	        |	|_
        ||_        d| d|_        |
|_        ||fS )zParse character class [...]r	   Nr7   ]zUnclosed character classr0   )r%   r&   _parse_charsetrG   _quantifier_to_range_detect_case_sensitivityr
   r!   charset
min_length
max_lengthoriginal_quantifieroriginal_charset_strcase_sensitive)r   r   r(   r)   rK   class_contentrX   rJ   min_lenmax_lenr]   r+   s               r   r2   zRegexParser._parse_charclass   s@   	q&=9 Flws|s2WS\)M1HC Flws|s2 &=NN569q %%m49 00#vF
C44Z@ 66w?"$$!$$&0#)*=/';$!/|r   rF   c                 N   t               }d}t        |      }||k  r]|dz   |k  r||dz      dk(  r||   }||dz      }|dk(  r$|dz   |k  r| j                  ||dz            }|dz  }|dk(  r$|dz   |k  r| j                  ||dz            }|dz  }	 t        t	        |      t	        |      dz         D ]  }|j                  t        |              	 |dz  }n||   dk(  r{|dz   |k  rs| j                  ||dz            }|dk(  r|j                  d	       n?|d
k(  r|j                  d       n(|dk(  r|j                  d       n|j                  |       |dz  }n|j                  ||          |dz  }||k  r]|S # t        $ r t        j                  d| d|        Y w xY w)z1Parse character class content into character set.r   r<   r	   r:   \   zInvalid character range: d
0123456789w?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_s 	
)setr   _unescape_charrangeordaddchr
ValueErrorr%   r&   update)	r   rF   charsr;   r)   startendcescapeds	            r   rU   zRegexParser._parse_charset   s   W&j1uv~'!a%.C"7
a!en D=QUV^ //A?EFA$;1q56>--ga!en=CFAN"3u:s3x!|< *		#a&)*
 Qt#A--ga!en=c>LL.^LL!bc^LL/IIg&Q 		'!*%QK &jN + " NNN%>ugQse#LMNs   A E= =$F$#F$r5   c           
      >    ddddddddd	d
	}|j                  ||      S )zUnescape special characters.
	rb   r:   rT   r0   )	ntrfvrb   r:   rT   r0   )get)r   r5   
escape_maps      r   rk   zRegexParser._unescape_char   s/    dDtSW^ahkruv
~~dD))r   c                     ||k\  rd|fS ||   }|dk(  r7|}||k  r||   dk7  r|dz  }||k  r	||   dk7  r||k  r|dz  }||| |fS d|fS |dv r	|dz  }||fS d|fS )zParse quantifier {n,m}, +, *, ?r7   {}r	   z+*?r@   )r   r   r(   r)   r5   rs   s         r   rG   zRegexParser._parse_quantifier   s    &=s7Ns|3;E,73<3#6q ,73<3#6V|quS)3..
 3w	 U]1HC93wr   rJ   c                    |sy|dk(  rd| j                   fS |dk(  rd| j                   fS |dk(  ry|j                  d      rr|j                  d	      ra|dd
 }	 d|v rH|j                  d      }|d   rt	        |d         nd}|d   rt	        |d         n
t        d      }||fS t	        |      }||fS y# t        $ r t        j                  d|        Y yw xY w)z#Convert quantifier to length range.)r	   r	   +r	   *r   r8   )r   r	   r   r   ,infzInvalid quantifier: )	r   
startswithendswithsplitintfloatrp   r%   r&   )r   rJ   rF   partsmin_valmax_valvals          r   rV   z RegexParser._quantifier_to_range  s    3t11223t11223""3'J,?,?,D 2&G'>#MM#.E/4Qxc%(mQG/4Qxc%(mU5\G#W--g,C:%
 	  !5j\BCs   AB4 $B4 4!CCc                 r   ||   }|dk(  r|dz   |k  r||dz      }|dk(  r|dz  }| j                  |||      \  }}| j                  |      \  }}t               }	|dz
  |	_        t	        t
        j                  t
        j                  z   dz         |	_        ||	_	        ||	_
        ||	_        d|	_        | j                  |      |	_        |	|fS |dk(  r|dz  }| j                  |||      \  }}| j                  |      \  }}t               }	|dz
  |	_        t	        t
        j                        |	_        ||	_	        ||	_
        ||	_        d|	_        | j                  |      |	_        |	|fS |||dz    }
|dz  }t               }	|dz
  |	_        |
|	_        |	|fS |dz  }t               }	|dz
  |	_        ||	_        |	|fS )	zIParse special characters and escape sequences preserving original format.rb   r	   rf   r<   _[a-zA-Z0-9_]rd   [0-9])rG   rV   r
   r!   rj   stringascii_lettersdigitsrX   rY   rZ   r[   r\   rW   r]   r   rF   )r   r   r(   r)   r5   	next_charrJ   r_   r`   r+   original_escapes              r   r3   zRegexParser._parse_special/  s   s|4<C!Gf,a(I Cq"&"8"8#v"N
C#'#<#<Z#H  +,#&7 "%f&:&:V]]&JS&P"Q%,"%,".8+/=,)-)F)Fw)O&|#c!q"&"8"8#v"N
C#'#<#<Z#H  +,#&7 "%fmm"4%,"%,".8+/6,)-)F)Fw)O&|# #*#a"8q&.#&7 "1|# 1HC"nG"QwG"GOC<r   c                     |}d}||k  r!||   dvr|||   z  }|dz  }||k  r||   dvrt               }||_        ||_        ||fS )z7Parse fixed string segment preserving escape sequences.r7   z()[].*+?{}^$|\\r	   )r   r!   rF   )r   r   r(   r)   rK   rF   r+   s          r   r4   zRegexParser._parse_fixedh  sm    	Flws|3EEws|#G1HC Flws|3EE .$!|r   c                 
    d|v S )z0Detect if pattern has (?-i) case sensitive flag.z(?-i)r@   )r   r   s     r   rW   z$RegexParser._detect_case_sensitivityv  s    '!!r   c                 J    | j                  |      }| j                  |      }|S )z;Preprocess pattern to handle shortcuts and negated classes.)_expand_shortcuts_convert_negated_classes)r   r   	processeds      r   r   zRegexParser._preprocess_patternz  s,     **73	 11)<	r   c                 T    g d}|}|D ]  \  }}t        j                  |||      } |S )zEExpand regex shortcuts like \d, \w, \s to explicit character classes.))z(?<!\[)\\d(?![^\[]*\])r   )z(?<!\[)\\D(?![^\[]*\])z[^0-9])z(?<!\[)\\w(?![^\[]*\])r   )z(?<!\[)\\W(?![^\[]*\])z[^a-zA-Z0-9_])z(?<!\[)\\s(?![^\[]*\])z[ \t\n\r\f\v])z(?<!\[)\\S(?![^\[]*\])z[^ \t\n\r\f\v]resub)r   r   replacementsresultoldnews         r   r   zRegexParser._expand_shortcuts  s<    

 $ 	.HCVVCf-F	. r   c                 @     d} fd}t        j                  |||      S )z:Convert negated character classes to positive equivalents.z\[\^([^\]]+)\]c                 H    | j                  d      }j                  |      S )Nr	   )group_negated_to_positive)matchnegated_contentr   s     r   replace_negatedz=RegexParser._convert_negated_classes.<locals>.replace_negated  s!    #kk!nO,,_==r   r   )r   r   negated_patternr   s   `   r   r   z$RegexParser._convert_negated_classes  s$     ,	> vvo@@r   r   c                 <   t               d}|t        |      k  r|dz   t        |      k  r_||dz      dk(  rT||   ||dz      }}t        t        |      t        |      dz         D ]  }j	                  t        |              |dz  }n#||   dk(  r|dz   t        |      k  r||dz      }|dk(  rj                  d       n|d	k(  r j                  t        j                         n|d
k(  r4j                  t        j                  t        j                  z   dz          nm|dk(  rj	                  d       nV|dk(  rj	                  d       n?|dk(  rj	                  d       n(|dk(  rj	                  d       nj	                  |       |dz  }nj	                  ||          |dz  }|t        |      k  rg }t        fdt        j                  D              s|j                  d       t        fdt        j                  D              s|j                  d       d}|D cg c]	  }|vs| }	}|	rDg }
|	D ],  }|dv r|
j                  d|z          |
j                  |       . |j                  |
       ddj                  |       dS c c}w )z?Convert negated character class content to positive equivalent.r   r<   r	   r:   rc   rb   rh   ri   rd   rf   r   /r~   ry   r}   rx   r   rz   c              3   &   K   | ]  }|v  
 y wr   r@   rA   ru   excludeds     r   rC   z3RegexParser._negated_to_positive.<locals>.<genexpr>  s     ?Q1=?rD   za-zA-Zc              3   &   K   | ]  }|v  
 y wr   r@   r   s     r   rC   z3RegexParser._negated_to_positive.<locals>.<genexpr>  s     8Q1=8rD   z0-9z!#$%&()*+,.:;<=>?@_`{|}~-z]^-\r0   r7   rT   )rj   r   rl   rm   rn   ro   rq   r   r   r   rE   r"   extendjoin)r   r   r;   rs   rt   ru   r   r   
safe_punctincluded_punctescaped_punctr   s              @r   r   z RegexParser._negated_to_positive  st    5#o&&1us?++A0F#0M,Q/Q1Gss5z3s8a<8 )ALLQ()Q #t+AO8L0L+AE2	#OOM2#%OOFMM2#%OOF$8$86==$H3$NO#%LL%#%LL&#%LL&#%LL&LL+Q_Q/0QA #o&&J  ?&*>*>??LL" 8&--88LL 1
%/E1H3D!EEM# ,<!((2!((+	,
 LL'2775>"!$$ Fs   .	J8Jr*   Nc                 &   d}|D ]  }||_         t        |t              r|t        |j                        z  }3t        |t
              r| j                  |j                         _t        |t              sp| j                  |j                          y)z/Calculate fixed prefix length for each segment.r   N)prefix_length
isinstancer   r   rF   r   r#   r   )r   r*   r   r+   s       r   r#   z%RegexParser._calculate_prefix_lengths  su     
	@G$1G!'<0W__!55G\2..w?G_5..w?
	@r   )   )!__name__
__module____qualname____doc__r   r   strr   r   r-   r   r   r    r1   r
   r2   r   rU   rk   rG   r   r   rV   r3   r   r4   boolrW   r   r   r   r   r#   r@   r   r   r   r      s   6;c ;S T'] 6;3 ;S ;# ;%QXHY[^H^B_ ; V C V c V 3 V 5RYIZ\_I_C` V p( (# (s (uXVfMgilMlGm (T-c -c#h -^*3 *3 *
 3  cSVh (s uS%U
BS=S7T 67 c 7  7 S 7 U8T[K\^aKaEb 7 rC c 3 5WZIZC[ " " "3 3   (
A 
A 
AA%C A%C A%F@$w- @D @r   r   )r   r   r   typingr   r   r   r   r   tools.loggerr   r+   r
   r   r   r   r   typesr   r%   r   r@   r   r   <module>r      sD    
  4 4 #   	H	\@, \@r   