
    :j                         d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ  ed      Z G d	 d
e      Zy)z0
Query generator for enumerated regex patterns.
    N)ListSet)
get_logger   )CharClassSegmentEnumerationStrategyFixedSegmentGroupSegmentOptionalSegmentSegment)IQueryGeneratorrefinec                      e Zd ZdZd6defdZd7dee   dededee	   fdZ
dee   d	edefd
ZdedefdZdee   dededee   fdZdedededefdZdee	   dedeee	      fdZd8de	dede	fdZdee	   de	fdZdedededee	   fdZde	defdZ	 d9dee   dedededee	   f
dZ	 d8dee   dee   dedee	   fd Zd7dededee	   fd!Zd"e	de	fd#Zdedefd$Zdee   dede	dee   fd%Zdededefd&Zded'edefd(Zd)e	d'edede	fd*Zdee   de	fd+Zd,e	de	fd-Z d.e	de	fd/Z!d0e	defd1Z"d.e	d2ede#e	   fd3Z$dede	fd4Z%y5):QueryGenerator+Generate queries from enumeration strategy.	max_depthc                     || _         y N)r   )selfr   s     7/root/.openclaw/workspace/harvester/refine/generator.py__init__zQueryGenerator.__init__   s	    "    segmentsstrategy
partitionsreturnc           
         |j                   s| j                  |      }|r|gS g S |j                   D ]^  }|j                  r|j                  |_        !| j                  |j                        }| j                  ||j                        }||_        ` 	 |dkD  r| j                  |j                   |      }| j                  |j                   ||      }	t               }
t        |j                   |	      D ]*  \  }}| j                  ||||      }|
j                  |       , t        |
      }t        j!                  dt#        |       d| dt#        |j                          d       |S t#        |j                         dk(  rC| j                  ||j                   d         }
t        j!                  dt#        |
       d       |
S | j%                  ||j                   d	      }
t        j!                  dt#        |
       d
t#        |j                          d       |
S # t&        $ r:}t        j)                  d|        | j                  |      }|sg n|gcY d}~S d}~ww xY w)r   r   
Generated z unique queries with depth z from z partsr   z' queries for single segment enumerationT)separatez queries for z" segments enumeration (separately)zQuery generation failed: N)r   _reconstruct_patterncase_sensitivecharseteffective_charset_expand_charset_shortcutsoriginal_charset_str_parse_charset_to_set_calculate_min_depth_for_target_split_partssetzip!_generate_queries_for_single_partupdatelistloggerinfolen$_generate_queries_for_multiple_parts	Exceptionwarning)r   r   r   r   querysegmentexpandedr"   depth
parts_listqueriestarget_segmentpartspart_queriesresultes                   r   generatezQueryGenerator.generate    sQ     --h7E#E7++  (( 	4G%%,3OO)  99':V:VW44Xw?U?UV,3)	4(	0A~ <<X=N=NPZ[!..x/@/@%T
% .11B1BJ-O 1)NE#'#I#I(Tbdikp#qLNN<01 g V-HzZ]^f^o^oZpYqqwx 
 x(()Q."DDXxO`O`abOcdGKK*S\N:a bc"N #GGRZRcRcnrGsGKK$S\N-HDUDU@V?WWyz #N 	0NN6qc:;--h7E"2/	0s-   CH &AH AH 	I"/IIItarget_queriesc                    |r|dk  ryg }|D ]s  }|j                   dk(  rd}nN|j                   t        d      k(  r| j                  }n)t        t	        |j                         | j                        }|j                  |       u t        dt        |      dz         D ]  }d}t        |      D ]Z  \  }}|j                  rt        |j                        nt        |j                        }	|	dkD  sBt        |||         }
|	|
z  }||z  }\ ||k\  sst        j                  d| d| dt        |       d|        |c S  |rt        |      nd}t        j                  d| d	|        |S )
zGCalculate minimum enumeration depth needed to reach target query count.r   infr   zCalculated min depth z for target z queries with z segments, total_queries=zCannot satisfy target z, using max depth )
max_lengthfloatr   minintappendrangemax	enumerater#   r0   r"   r.   debug)r   r   r@   
max_depthsr5   r   r7   total_queriesicharset_sizeeffective_depthsegment_queriesr=   s                r   r'   z.QueryGenerator._calculate_min_depth_for_target\   s   >Q. 
 		)G !!Q&	##uU|3 NN	G$6$6 7H	i(		) 1c*o12 	EM'1 5
7AHAZAZs7#<#<=`cdkdsds`t!#&)%A&?O&2O&CO!_4M5 .+E7,~>N OM?*CM?T 	$ %/ZA-n-==OPVxXYr   r5   c                     |j                   dk(  ry|j                   t        d      k(  r| j                  S t        t	        |j                         | j                        S )z6Return maximum usable enumeration depth for a segment.r   rB   )rC   rD   r   rE   rF   )r   r5   s     r   
_max_depthzQueryGenerator._max_depth   sK    "u->>!3w))*DNN;;r   r7   r;   c                 *   |sg S |dk  r|gt        |      z  S t        |      dk(  r|gS g }|D ]~  }|j                  rt        |j                        nt        |j                        }|dk  r|j                  d       Pt	        || j                  |            }|j                  ||z          t        |      }|dk  r't        d|t        |      z        }	|	gt        |      z  S |D 
cg c]
  }
||
z  |z   c}
D cg c]*  }t        dt        t        j                  |                  , }}|t        |      z
  }|dkD  rDt        t        t        |            fdd      }|D ]  }|dk  r |S ||xx   dz  cc<   |dz  } |S c c}
w c c}w )zCSplit partitions across segments by theoretical combination weight.r   r   c                 @    |    t        j                  |          z
  S r   )mathfloor)rN   raws    r   <lambda>z-QueryGenerator._split_parts.<locals>.<lambda>   s     s1v

SVWXSYHZ?Z r   T)keyreverse)r0   r#   r"   rG   rE   rS   sumrI   rF   rV   rW   sortedrH   )r   r   r7   r;   sizesr5   cdtotalbasesxallocremorderrN   rX   s                   @r   r(   zQueryGenerator._split_parts   s   IA:7S]**x=A7N 	G292K2KG--.QTU\UdUdQeAAvQE4??734ALLA	 E
A:q%3x=01D6CM))*/0Quqy5 0589QDJJqM*+99c%j 75X/5ZdhiE !8  aAq	  19s   +F/Fc                     |j                   rt        |j                         nt        |j                        }|dk  s
|dk  s|dk  ry||z  }t        d||z        }t	        ||      S )z,Compute last-layer group size for a segment.r   r   )r#   r0   r"   rI   rE   )r   r5   r7   r;   r_   ra   sizes          r   _inner_sizezQueryGenerator._inner_size   sc    .5.G.GC))*SQXQ`Q`Ma6UaZ5A:51eun%1d|r   charsri   c                 n    |dk  rd}t        dt        |      |      D cg c]
  }||||z     c}S c c}w )z#Split chars into fixed-size groups.r   r   )rH   r0   )r   rk   ri   rN   s       r   _split_charszQueryGenerator._split_chars   s:    19D-21c%j$-GHa!d(#HHHs   2r_   flagc                 J    |dk(  ry|dk(  ry|r|dv rd|z   S |S |dv rd|z   S |S )z#Escape a character for regex usage./\/\z\\z-]^z.^$|?*+(){}[] )r   r_   rn   s      r   _escapezQueryGenerator._escape   sE    89EzaxH  !8Or   groupc                      t        |      dk(  r j                  |d         S dj                   fd|D              }d| dS )z2Build a regex character class string from a group.r   r    c              3   D   K   | ]  }j                  |d         yw)T)rn   N)rt   ).0r_   r   s     r   	<genexpr>z,QueryGenerator._build_str.<locals>.<genexpr>   s     D$,,qt,4Ds    [])r0   rt   join)r   ru   contents   `  r   
_build_strzQueryGenerator._build_str   sC    u:?<<a))''DeDD7)1~r   c                    t        t        |j                        xs g       }|sdgS t        || j	                  |            }| j                  |||      }| j                  ||      }|dk(  r|D cg c]  }| j                  |       c}S g }	t        j                  ||dz
        D ]L  }
| j                  dj                  |
            }|D ]%  }|	j                  || j                  |      z          ' N |	S c c}w )z3Generate grouped prefix combinations for a segment.rw   r   repeat)r]   r-   r#   rE   rS   rj   rm   r   	itertoolsproduct_escape_regex_charsr}   rG   )r   r5   r7   r;   rk   r`   innergroupsgcombosprepre_strs               r   _grouped_comboszQueryGenerator._grouped_combos   s    tG556<"=4Ktw/0  !U3""5%060671DOOA&77$$U1q59 	<C..rwws|<G <g(::;<	<
  8s   1C9
enum_valuec                    |syd}d}|t        |      k  r||   }|dk(  rs|dz   }|t        |      k  r<||   dk(  r|dz   t        |      k  r|dz  }-||   dk(  rn|dz  }|t        |      k  r<|dz  }|t        |      k  r|dz   n
t        |      }n+|dk(  r|dz   t        |      k  r|dz  }|dz  }n
|dz  }|dz  }|t        |      k  r|S )z3Return how many characters a prefix regex consumes.r   r{   r   rr      r|   )r0   )r   r   rN   countchjs         r   	_enum_lenzQueryGenerator._enum_len   s   #j/!ABSyE#j/)!!},QZ1HQ !!}+FA #j/) 
Z0AEc*otAJ 7
Q
Q) #j/!, r   targetc                    |dkD  r|dkD  r| j                  |||      }n)|dkD  r| j                  ||      }n| j                  |      }g }|D ]O  }| j                  |||      }| j                  |      }	|	s*|	| j                  |      k7  s?|j	                  |	       Q |sA|r?|D ]:  }| j                  |||      }| j                  |      }	|	s*|j	                  |	       < |S )zLGenerate queries by enumerating target segment with optional specific depth.r   )r   _generate_segment_combinations_apply_single_enumerationr    rG   )
r   r   r   r7   r;   r   r9   combonew_segmentspatterns
             r   r+   z0QueryGenerator._generate_queries_for_single_part  s     19))&%?FQY88GF88@F 	(E99(FERL//=G7d&?&?&IIw'	( 6 ,#==hPUV33LANN7+	, r   targetsr   c           	         |r,g }t        |      D ]  \  }}t        j                  d| d|j                   d|j                          | j                  ||      }t        j                  dt        |       d|        |rt        j                  d| d|d           |j                  |        t        j                  d	t        |              t               }g }	|D ])  }
|
|vs|j                  |
       |	j                  |
       + t        j                  d
t        |	              |	S g }|D ]$  }| j                  |      }|j                  |       & g }t        j                  | D ]w  }|j                         }t        |      D ]  \  }}||   }| j                  |||      } | j!                  |      }|sR|| j!                  |      k7  sg|j                  |       y |S )aM  Generate queries by enumerating multiple segments.

        Args:
            segments: List of all segments
            targets: List of target segments to enumerate
            separate: If True, enumerate each segment separately (union).
                     If False, enumerate all segments together (Cartesian product).
        zEnumerating target segment z: position=, value=r   z queries for segment zSample query for segment z: r   z$Total queries before deduplication: z$Unique queries after deduplication: )rJ   r.   rK   positionvaluer+   r0   extendr)   addrG   r   r   r   copyr   r    )r   r   r   r   all_queriesrN   r   r@   seenunique_queriesr4   all_combinationscombinationsr9   combo_tupler   r   r   s                     r   r1   z3QueryGenerator._generate_queries_for_multiple_parts+  s    K 'w/ 3	6:1#[HYYabhbnbnaopq!%!G!GRX!Yz#n*=)>>STUSVWX!LL#<QCr.QRBSAT!UV"">23 LL?K@P?QRS 5DN$ 1$HHUO"))%01
 LL?N@S?TUV!!  "! 6#BB6J ''56
 G(002BC 	,'}}!*7!3 dIAv!,QJ#'#A#A,PVXb#cLd 33LAw$*C*CH*MMNN7+	, Nr   c           	         t        t        |j                        xs g       }|dkD  r||sdgS g }t        j                  ||      D ]5  }dj                  |      }| j                  |      }|j                  |       7 t        j                  dt        |       d|        |S t        dt        d| j                  |                  }|dk(  rdgS g }t        j                  ||      D ]5  }dj                  |      }| j                  |      }|j                  |       7 t        j                  dt        |       d|        |S )zFGenerate combinations for single segment with optional specific depth.r   rw   r   r   z combinations for depth    r   )r]   r-   r#   r   r   r}   r   rG   r.   r/   r0   rE   rI   _calculate_optimal_depth)	r   r5   r7   r"   r   r   	combo_strescaped_combooptimal_depths	            r   r   z-QueryGenerator._generate_segment_combinationsf  sL    g778>B?19t L"**75A 3GGEN	 $ 8 8 C##M23
 KK*S%6$77OPUwWX  3q$*G*G*P#QRM!t L"**7=I 3GGEN	 $ 8 8 C##M23
 KK*S%6$77OP]_`r   combinationc                 2    |s|S |j                  dd      }|S )zFEscape special regex characters in combination to ensure valid syntax.rp   rq   replace)r   r   escapeds      r   r   z"QueryGenerator._escape_regex_chars  s&     %%c51
 r   c           	         |j                   rt        |j                         nt        |j                        }|dk(  ry|j                  dkD  r|j                  dkD  rt	        d| j
                        }ny|j                  dkD  rt	        d| j
                        }nS|j                  dkD  rt	        d| j
                        }n-t	        d| j
                        }nt	        d| j
                        }t	        ||j                        }|dk  rt	        |dz   | j
                        }n|d	k\  rt        d|dz
        }t        j                  d
| d| d|j                  d       |S )zCCalculate optimal enumeration depth based on mathematical analysis.r         
   r      r   r   2   zCalculated optimal depth z for charset_size=r   z.3f)
r#   r0   r"   r   rE   r   
min_lengthrI   r.   rK   )r   r5   rO   target_depths       r   r   z'QueryGenerator._calculate_optimal_depth  s;   9@9R9Rs7445X[\c\k\kXl1 ==1}}r!"1dnn5#"1dnn5""1dnn5"1dnn5 q$..1L <););< 2|a/@LRq,"23L'~5G~Uabibobopsatu	
 r   c                    g }|D ]g  }| j                  ||      rC|r-t               }|j                  |_        ||_        |j	                  |       | j                  |      }t        d|j                  |z
        }t        d|j                  |z
        }	|j                  dv xs |	dkD  }
|
st               }|j                  |_        |j                  j                         |_        |j                  |_        |j                  |_        ||_        |	|_        | j                  |j                  ||      |_	        |j	                  |       G|j	                  |       Zt!        |t"              rt#               }|j                  |_        |j$                  |_        t'        |dd      x}r||_        t'        |dd      x}r||_        | j-                  |j                  ||      |_        |j	                  |       t!        |t.              rPt/               }|j                  |_        | j-                  |j                  ||      |_        |j	                  |       W|j	                  |       j |S )zHApply enumeration to only the target segment, including nested segments.r   )+*original_prefixN
quantifier)_is_target_segmentr	   r   r~   rG   r   rI   r   rC   original_quantifierr   r"   r   r%   r!   _adjust_quantifier
isinstancer
   	capturinggetattrr   r   r   r   )r   r   r   r   r   r5   	fixed_seglengthremaining_minremaining_maxshould_create_remainingremaining_seg	new_groupr   r   new_optionals                   r   r   z(QueryGenerator._apply_single_enumeration  sE     7	-G&&w7 ,I)0)9)9I&(2I% ''	2 "^^J7F$'7+=+=+F$GM$'7+=+=+F$GM /6.I.IZ.W.l[hkl[l+.(8(:181A1A.070D0D0F-=D=Y=Y:7>7M7M43@03@0<@<S<S#77"#=9
 %++M: !''0G\2(N	%,%5%5	"&-&7&7	#&-g7H$&OO?O0?I-!(,!EE:E+5I($($B$B7??TZ\f$g	!##I.G_5.0(/(8(8%'+'E'EgooW]_i'j$##L1 ##G,o7	-r r   c                     t        |t              sy|j                  |j                  k(  xrO |j                  |j                  k(  xr4 |j                  |j                  k(  xr |j
                  |j
                  k(  S )z4Check if segment is the target segment to enumerate.F)r   r   r   r"   r   rC   )r   r5   r   s      r   r   z!QueryGenerator._is_target_segment   sr    '#34 / 86>>18""f&7&778 ""f&7&77		
r   enum_lengthc                 p   |j                   dk(  rt        d|j                  |z
        }|S |j                   dk(  rt        d|j                  |z
        }|S |j                   dk(  rt        dd|z
        S |j                   j                  d      rt        d|j                  |z
        }|S t        dd|z
        S )z5Calculate remaining minimum length after enumeration.r   r   r   ?r   {)r   rI   r   
startswith)r   r5   r   r   s       r   _calculate_remaining_lengthz*QueryGenerator._calculate_remaining_length  s    &&#-7#5#5#CDM  ((C/7#5#5#CDM  ((C/q!k/**((33C87#5#5#CDM   q!k/**r   r   c                    |dk(  rt        d|j                  |z
        }t        d|j                  |z
        }|dk(  r|dkD  ry|dk(  r|dkD  ry||k(  r|dkD  rd| dS ||k7  r"|dkD  r|t        d      k(  rd| dS d| d	| dS y
|dk(  rut        d|j                  |z
        }t        d|j                  |z
        }|dk(  r|dkD  ry||k(  r|dkD  rd| dS ||k7  r"|dkD  r|t        d      k(  rd| dS d| d	| dS y
|dk(  r|dk\  ry
y|j	                  d      rjt        d|j                  |z
        }t        d|j                  |z
        }||k(  r|dkD  rd| dS ||k7  r"|dkD  r|t        d      k(  rd| dS d| d	| dS y
|S )z$Adjust quantifier after enumeration.r   r   r   r   r   }rB   ,},rw   r   )rI   r   rC   rD   r   )r   r   r   r5   r   r   s         r   r   z!QueryGenerator._adjust_quantifier"  s   #%7#5#5#CDM7#5#5#CDM!ma&7!#(9-/MA4EM?"---/MA4E E%L0c22abAA C'7#5#5#CDM7#5#5#CDM!ma&7-/MA4EM?"---/MA4E E%L0c22abAA C'a ++C07#5#5#CDM7#5#5#CDM--!2CM?"---/MA4E E%L0c22abAA&&r   c                 |   	 d}|D ]  }t        |t              r|| j                  |j                        z  }3t        |t              r|| j                  |      z  }Xt        |t              rh| j                  |j                        }d}|j                  rd| d}nt        |dd      x}r	d| | d}nd| d}t        |dd      x}r||z  }||z  }t        |t              s| j                  |j                        }|d| dz  } |S # t        $ r"}	t        j                  d	|	        Y d}	~	yd}	~	ww xY w)
zFReconstruct regex pattern from segments preserving original structure.rw   ()r   Nz(?:r   z)?zPattern reconstruction failed: )r   r	   _preserve_escapesr~   r   _reconstruct_charclassr
   r    r   r   r   r2   r.   r3   )
r   r   r=   r5   group_content	group_strr   r   optional_contentr>   s
             r   r    z#QueryGenerator._reconstruct_pattern`  s]   !	F# 9g|4d44W__EEF)9:d99'BBF6$($=$=goo$NM "I((&'a$8	 /6g?PRV.WW?W*+O+<]O1(MI*-m_A(>I &-WlD%IIzI!Z/	i'F9'+'@'@'Q$$4#5R88F194 M 	NN<QC@A	s   C$D '(D 	D;D66D;r~   c                     |S )z4Preserve original escape sequences in fixed content.rs   )r   r~   s     r   r   z QueryGenerator._preserve_escapes  s	     r   charset_strc                 V    |s|S |j                  dd      }|j                  dd      }|S )z-Expand \w and \d shortcuts in charset string.z\wz
a-zA-Z0-9_z\dz0-9r   )r   r   r6   s      r   r$   z(QueryGenerator._expand_charset_shortcuts  s6     &&ul;##E51r   r   c                 
    d|v S )z0Detect if pattern has (?-i) case sensitive flag.z(?-i)rs   )r   r   s     r   _detect_case_sensitivityz'QueryGenerator._detect_case_sensitivity  s    '!!r   r!   c                 v   t               }|j                  d      r|j                  d      r|dd }d}|t        |      k  r ||   dk(  ru|dz   t        |      k  rd||dz      }|dk(  r|j	                  d       n?|dk(  r|j	                  d       n(|dk(  r|j	                  d       n|j	                  |       |dz  }n|dz   t        |      k  ri||dz      dk(  r^||dz      dk7  rS||   }||dz      }t        t        |      t        |      dz         D ]  }|j	                  t        |              |d	z  }n|j	                  ||          |dz  }|t        |      k  r |sTt               }	|D ]C  }
|
j                         r |	j	                  |
j                                3|	j	                  |
       E |	}|S )
zAParse charset string to character set, handling case sensitivity.r{   r|   r   r   rr   -r   r   )
r)   r   endswithr0   r   rH   ordchrisalphalower)r   r   r!   rk   rN   	next_char
start_charend_charcodenormalized_charschars              r   r&   z$QueryGenerator._parse_charset_to_set  s    !!#&;+?+?+D%a+K#k""1~%!a%#k2B*B'A.	#IIcN$&IIdO#%IIcNIIi(QQ[))k!a%.@C.GKXY\]X]L^beLe(^
&q1u-!#j/3x=13DE )DIIc$i()Q 		+a.)Q/ #k""4 "u /<<>$((6$((.	/
 %Er   c                    t        |dd      x}r|}n:t        t        |j                              }d}d}|t	        |      k  r||   }|dz   t	        |      k  rt        ||dz            t        |      dz   k(  rt        ||dz            t        |      dz   k(  r|dz   }|dz   t	        |      k  r]t        ||dz            t        ||         dz   k(  r:|dz  }|dz   t	        |      k  r$t        ||dz            t        ||         dz   k(  r:|| d||    z  }|dz   }|dv r	|d	| z  }n||z  }|dz  }|t	        |      k  rd
| d}t        |dd      x}	r|	}
n|j                  |j                  k(  r#|j                  dk(  rd}
nd|j                   d}
n|j                  t        d      k(  s|j                  dk\  r5|j                  dk(  rd}
nr|j                  dk(  rd}
n`d|j                   d}
nO|j                  t        d      k(  rd|j                   d}
n&d|j                   dt        |j                         d}
| |
 S )z@Reconstruct character class pattern preserving original escapes.r%   Nrw   r   r   r   r   z\-]^rr   r{   r|   r   r   r   rB      r   r   r   r   )
r   r]   r-   r"   r0   r   r   rC   rD   rF   )r   r5   r%   charset_partr"   class_contentrN   r   end_ir   r   s              r   r   z%QueryGenerator._reconstruct_charclass  s    $+74JD#QQQ/L T'//23GMAc'l"qz q53w<'71q5>*c$i!m;GAPQEN@SWZ[_W`cdWd@d !A#ai#g,63wuqy?Q;RVYZabgZhVilmVm;m!QJE $ai#g,63wuqy?Q;RVYZabgZhVilmVm;m &D675>2B)CC!AI  7?!r$[0M!T)MQ) c'l", }oQ/L #*'3H$"OOO,J !!W%7%77%%*!#J#%g&8&8%9!<J##uU|3w7I7IS7P%%*!$J''1,!$J#%g&8&8%9!=J%%u5#%g&8&8%9!=J#%g&8&8%93w?Q?Q;R:SSU!VJ
|,,r   N)r   )r   )F)r   r   )&__name__
__module____qualname____doc__rF   r   r   r   r   strr?   r   r'   rS   r(   rj   rm   boolrt   r   r   r   r+   r1   r   r   r   r   r   r   r   r    r   r$   r   r   r&   r   rs   r   r   r   r      s5   5## #:0g :0:M :0[^ :0hlmphq :0x'=M8N '`c 'hk 'R<"2 <s <"T*:%; "C "PS "X\]`Xa "H#3 C  PS I$s) I3 I4S	? I D S  S	 c '7  C TXY\T] *C C @ `bW/?HKY\	c@ Z_9W9045E0F9RV9	c9v# 6F # s # \`ad\e # Js s %0@ %S %N?W?/??MP?	g?B
' 
;K 
PT 
+3C +RU +Z] +*<'c <' <'Vf <'kn <'|#T'] #s #J  
S 
S 
" " "- -d -sSVx -^9-.> 9-3 9-r   r   )r   r   rV   typingr   r   tools.loggerr   r5   r   r   r	   r
   r   r   typesr   r.   r   rs   r   r   <module>r     s?       #  #	H	i-_ i-r   