
    :jbP                     V   d Z ddlZddlZddlmZmZmZ ddlmZm	Z	m
Z
mZmZmZ ddlmZmZ ddlmZmZmZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0  e$d      Z1 e0ejd                  jf                  g ejh                  jf                  ejj                  jf                  gd       G d de)             Z6 e0ejh                  jf                  ejd                  jf                  gejj                  jf                  gd       G d de)             Z7 e0ejj                  jf                  g ejp                  jf                  gd       G d de)             Z9 e0ejp                  jf                  g g d       G d de)             Z:y)zu
Built-in stage definitions for the pipeline system.
Registers all standard pipeline stages with their dependencies.
    N)ListOptionalTuple)	API_LIMITAPI_MAX_PAGESAPI_RESULTS_PER_PAGE	WEB_LIMITWEB_MAX_PAGESWEB_RESULTS_PER_PAGE)SERVICE_TYPE_GITHUB_APISERVICE_TYPE_GITHUB_WEB)ErrorReasonPipelineStage
ResultType)AcquisitionTask	CheckTaskInspectTaskPatternsProviderTask
SearchTaskService)	IProvider)RefineEngine)client)
get_logger)get_service_namehandle_exceptions   )BasePipelineStageOutputHandlerStageOutputStageResources)TaskFactory)register_stagestagez$Search GitHub for potential API keys)name
depends_onproduces_fordescriptionc            	       X    e Zd ZdZdedef fdZdedefdZ	dede
fdZdede
fd	Zdedee   fd
Zdedee   fdZdedeee   eef   fdZdede
defdZdedeee   ef   fdZde
de
fdZdedededdfdZdedededee   fdZ eg d      dededee   fd       Z xZS )SearchStagezCPipeline stage for searching GitHub with pure functional processing	resourceshandlerc                 Z    t        |   t        j                  j                  ||fi | y N)super__init__r   SEARCHvalueselfr,   r-   kwargs	__class__s       7/root/.openclaw/workspace/harvester/stage/definition.pyr1   zSearchStage.__init__5   %    --33YR6R    taskreturnc           	          t        |t              r|n	t               }t        j                  j                   d|j
                   d|j                   d|j                   d|j                   	S 1Generate unique task identifier for deduplication:)	
isinstancer   r   r2   r3   providerquerypageregexr5   r;   search_tasks      r8   _generate_idzSearchStage._generate_id8   sb    (z:d
##))*!DMM?!K<M<M;NaP[P`P`Oaabcnctctbuv	
r:   c                 "    t        |t              S )z#Validate that task is a SearchTask.)rA   r   r5   r;   s     r8   _validate_task_typezSearchStage._validate_task_type?   s    $
++r:   c                 T   | j                   j                  |j                  d      s0t        j	                  d| j
                   d|j                          yt        |t              r|nd}|r|j                  s0t        j                  d| j
                   d|j                          yy)z6Pre-process search task - validate query and provider.search[z ] search disabled for provider: FNz] empty query for provider: T)
r,   
is_enabledrB   loggerdebugr&   rA   r   rC   warningrF   s      r8   _pre_processzSearchStage._pre_processC   s     ~~((ALL1TYYK'GWX )z:d+"3"3NNQtyyk)Edmm_UVr:   c                 $    | j                  |      S )zExecute search task processing.)_search_workerrJ   s     r8   _execute_taskzSearchStage._execute_taskS   s    ""4((r:   c                    	 |j                   dk(  r| j                  |      \  }}}n| j                  |      \  }}d}t        |      }g }|r|j                  r| j                  ||      }|D ]L  }t        j                  |j                  |      }|j                  |t        j                  j                         N |r;t        j                  d| j                   dt!        |       d|j                          |rt#        |j                  |j$                  |j&                  |j(                        }	|D ]M  }
t        j*                  |j                  |
|	      }|j                  |t        j,                  j                         O |j/                  |j                  |       |j                   dk(  r|dkD  r| j1                  |||       t        j                  d| j                   d|j                   d	|rt!        |      nd d
t!        |       d	       |S # t2        $ r?}t        j5                  d| j                   d|j                   d| d|        Y d}~yd}~ww xY w)zPure functional search workerr   r   r;   rN   z] extracted z% keys from search content, provider: )key_patternaddress_patternendpoint_patternmodel_patternz] search completed for z: z links, z keysz] error, provider: , task: , message: N)rD   _execute_first_page_search_execute_page_searchr!   rE   _extract_keys_from_contentr#   create_check_taskrB   add_taskr   CHECKr3   rP   infor&   lenr   rZ   r[   r\   create_acquisition_taskGATHER	add_links_handle_first_page_results	Exceptionerror)r5   r;   resultscontenttotaloutputkeyskey_service
check_taskpatternslinkacquisition_taskes                r8   rU   zSearchStage._search_workerW   s4   3	yyA~*.*I*I$*O'%#'#<#<T#B  !d+F D4::66wE#' KK!,!>!>t}}k!ZJOOJ0C0C0I0IJK KKDII;l3t9+=bcgcpcpbqr
 # $

$($8$8%)%:%:"&"4"4	 $ RD'2'J'J4==Z^`h'i$OO$4m6J6J6P6PQR
   8 yyA~%!)//eVDKKDII;5dmm_BW^s7|deFffnorswoxnyy~ M 	LL1TYYK':4==/RVQWWbcdbefg	s   HH 	I5IIc                    | j                  |j                         |j                  r%| j                  j                  j	                         }n$| j                  j                  j                         }t        j                  | j                  |j                  |j                        ||j                  |j                  |j                  rt        nt              \  }}}|||fS )z?Execute first page search and get total count in single requestrC   sessionrD   with_api	peer_page)_apply_rate_limituse_apir,   auth	get_tokenget_sessionr   search_with_count_preprocess_queryrC   rD   r   r   )r5   r;   
auth_tokenrm   ro   rn   s         r8   r_   z&SearchStage._execute_first_page_search   s     	t||, <<,,668J,,88:J #)":":((T\\B\\.2ll*@T#
 &&r:   rC   r~   c                 Z    |r(t        j                         j                  |      }|r|}|S )zQGithub Rest API search syntax don't support regex, so we need remove it if exists)rC   )r   get_instanceclean_regex)r5   rC   r~   keywords       r8   r   zSearchStage._preprocess_query   s.    "//1==E=JGr:   c                    | j                  |j                         |j                  r%| j                  j                  j	                         }n$| j                  j                  j                         }t        j                  | j                  |j                  |j                        ||j                  |j                  |j                  rt        nt              \  }}||fS )z0Execute subsequent page search in single requestry   )r}   r~   r,   r   r   r   r   search_coder   rC   rD   r   r   )r5   r;   r   rm   rn   s        r8   r`   z SearchStage._execute_page_search   s     	t||, <<,,668J,,88:J "--((T\\B\\.2ll*@T
 r:   c                    |rt         nt        }| j                  j                  j	                  |      s| j                  j                  j                  |      }|dkD  rt        j                  |       | j                  j                  j	                  |      sb| j                  j                  j                  |      }|r|j                  nd}t        j                  d| j                   d|rdnd d|        yy	)
z'Apply rate limiting for GitHub requestsr   unknownrN   z!] rate limit exceeded for Github zRest APIWeb, max: FT)r   r   r,   limiteracquire	wait_timetimesleep_get_bucketburstrP   re   r&   )r5   r~   service_typer   bucket	max_values         r8   r}   zSearchStage._apply_rate_limit   s    29.?V~~%%--l;..88FI1}

9%~~--55lC!^^33??MF06IIKKDII;&GV]
chGiipqzp{| !r:   ro   rp   Nc                    |j                   rt        nt        }|j                   rt        nt        }||kD  rt        t        j                  ||z              }t        j                         j                  |j                  |      }|D ]  }|s=t        j                  d| j                   d|j                   d|j                          B||j                  k(  r3t        j                  d| j                   d| d|j                          t!        |j                  ||j"                  d|j                   |j$                  |j&                  |j(                        }	|j+                  |	t,        j.                  j0                          t        j3                  d| j                   dt5        |       d	|j                   d
|j                          y||kD  r| j7                  |||      }
|
D ],  }|j+                  |t,        j.                  j0                         . t        j3                  d| j                   dt5        |
       d|j                   d
|j                          yy)z;Handle first page results - decide pagination or refinement)rC   
partitionsrN   z-] skip refined query due to empty for query: z, provider: z*] discard refined query same as original: r   rB   rC   rE   rD   r~   rZ   r[   r\   z] generated z refined tasks for provider: z	, query: z page tasks for provider: N)r~   r   r	   r   r   intmathceilr   r   generate_queriesrC   rP   rR   r&   rB   r   rE   rZ   r[   r\   rc   r   r2   r3   re   rf   _generate_page_tasks)r5   r;   ro   rp   limitper_pager   queriesrC   refined_task
page_tasks	page_tasks               r8   rj   z&SearchStage._handle_first_page_results   s(   !\\	y+/<<'=Q 5=TYYuu}56J"//1BB`jBkG ! JNNDII;&STXT^T^S__klplylykz{ djj(NNDII;&PQVPWWcdhdqdqcrs )!]]** LL$($8$8%)%:%:"&"4"4	  m.B.B.H.HI/J2 KKDII;l3w<.8UVZVcVcUddmnrnxnxmyz
 X224IJ' G		=+?+?+E+EFGKKDII;l3z?*;;UVZVcVcUddmnrnxnxmyz	 r:   r   c                 v   t        t        j                  ||z        |j                  rt        nt
              }g }t        d|dz         D ]l  }t        |j                  |j                  |j                  ||j                  |j                  |j                  |j                        }|j                  |       n |S )zGenerate pagination tasks   r   r   )minr   r   r~   r   r
   ranger   rB   rC   rE   rZ   r[   r\   append)r5   r;   ro   r   	max_pagesr   rD   r   s           r8   r   z SearchStage._generate_page_tasks  s     IIeh&'!\\M}
	
 (*
!Y]+ 	)D"jjjj $ 4 4!%!6!6"00	I i(	) r:   rl   )default_result	log_levelrn   c                     t        j                  |j                  |j                  |j                  |j
                  |      }|S )z)Extract keys directly from search content)rY   rZ   r[   r\   text)r   collectrE   rZ   r[   r\   )r5   rn   r;   servicess       r8   ra   z&SearchStage._extract_keys_from_content  s>     >>

 00!22,,
 r:   )__name__
__module____qualname____doc__r"   r    r1   r   strrH   boolrK   rS   r   r!   rV   r   rU   r   r   r   r_   r   r`   r}   rj   r   r   r   ra   __classcell__r7   s   @r8   r+   r+   ,   sz    NS. S= S
 
# 
, , , $  ), )8K3H )5: 5(;2G 5n'z 'eDIsTWDW>X ',s T c    d3in8M  , $  0z 0# 0{ 0_c 0d C 3 SWXbSc 0 bG<
# 
Z 
DQXM 
 =
r:   r+   z Gather keys from discovered URLsc                   x     e Zd ZdZdedef fdZdedefdZ	dede
fdZdedee   fd	Zdedee   fd
Z xZS )AcquisitionStagezKPipeline stage for acquiring keys from URLs with pure functional processingr,   r-   c                 Z    t        |   t        j                  j                  ||fi | y r/   )r0   r1   r   rh   r3   r4   s       r8   r1   zAcquisitionStage.__init__4  r9   r:   r;   r<   c                     t        |t              r|n	t               }t        j                  j                   d|j
                   d|j                   S r>   )rA   r   r   rh   r3   rB   url)r5   r;   rv   s      r8   rH   zAcquisitionStage._generate_id7  sF    #-dO#D4/J[&&,,-Qt}}oQ?O?S?S>TUUr:   c                 "    t        |t              S )z)Validate that task is an AcquisitionTask.)rA   r   rJ   s     r8   rK   z$AcquisitionStage._validate_task_type<  s    $00r:   c                 $    | j                  |      S )z$Execute acquisition task processing.)_acquisition_workerrJ   s     r8   rV   zAcquisitionStage._execute_task@  s    ''--r:   c                    	 t        j                  |j                  |j                  |j                  |j
                  |j                  |j                        }t        |      }|r|D ]L  }t        j                  |j                  |      }|j                  |t        j                  j                         N |j!                  |j                  t"        j$                  j                  |       |j'                  |j                  |j                  g       |S # t(        $ r?}t*        j-                  d| j.                   d|j                   d| d|        Y d}~yd}~ww xY w)z1Pure functional acquisition worker implementation)rY   r   retriesrZ   r[   r\   rX   rN   ] error for provider: r]   r^   N)r   r   rY   r   r   rZ   r[   r\   r!   r#   rb   rB   rc   r   rd   r3   
add_resultr   MATERIALri   rk   rP   rl   r&   )r5   r;   r   rp   servicers   rw   s          r8   r   z$AcquisitionStage._acquisition_workerD  s$   	~~ ,,HH $ 4 4!%!6!6"00H !d+F ' KG!,!>!>t}}g!VJOOJ0C0C0I0IJK
 !!$--1D1D1J1JHU T]]TXXJ7M 	LL1TYYK'=dmm_HUYTZZefgehij	s   DD 	E5EE)r   r   r   r   r"   r    r1   r   r   rH   r   rK   r   r!   rV   r   r   r   r   s   @r8   r   r   +  sx     VS. S= SV V# V
1 1 1., .8K3H .   H[<Q  r:   r   zValidate API keysc                   x     e Zd ZdZdedef fdZdedefdZ	dede
fdZdedee   fd	Zdedee   fd
Z xZS )
CheckStagezFPipeline stage for validating API keys with pure functional processingr,   r-   c                 Z    t        |   t        j                  j                  ||fi | y r/   )r0   r1   r   rd   r3   r4   s       r8   r1   zCheckStage.__init__p  s%    ,,22IwQ&Qr:   r;   r<   c           	      R   t        |t              r|nd}|rh|j                  r\|j                  }t        j                  j
                   d|j                   d|j                   d|j                   d|j                   	S t        j                  j
                   d|j                   dS r?   Nr@   z:unknown)
rA   r   r   r   rd   r3   rB   keyaddressendpoint)r5   r;   rs   r   s       r8   rH   zCheckStage._generate_ids  s    'i8Td
*,, ((G#))//0$--'++aPWP_P_O``abibrbrastt%%++,Admm_HEEr:   c                 "    t        |t              S )z"Validate that task is a CheckTask.)rA   r   rJ   s     r8   rK   zCheckStage._validate_task_type|  s    $	**r:   c                 $    | j                  |      S )zExecute check task processing.)_check_workerrJ   s     r8   rV   zCheckStage._execute_task  s    !!$''r:   c                    	 | j                   j                  j                  |j                        }|rt	        |t
              s<t        j                  d| j                   d|j                   dt        |              yt        |j                        }| j                   j                  j                  |      s| j                   j                  j                  |      }|dkD  rt        j                  |       | j                   j                  j                  |      sh| j                   j                  j!                  |      }|r|j"                  nd}t        j%                  d| j                   d|j                   d|        y|j'                  |j(                  j*                  |j,                  xs |j(                  j.                  |j(                  j0                  |j(                  j2                  	      }| j                   j                  j5                  |d
       t7        |      }|j8                  rt;        j<                  |j                  |j(                        }	|j?                  |	t@        jB                  jD                         |jG                  |j                  tH        jJ                  jD                  |j(                  g       |S |jL                  tN        jP                  k(  rB|jG                  |j                  tH        jP                  jD                  |j(                  g       |S |jL                  tN        jR                  tN        jT                  tN        jV                  fv rB|jG                  |j                  tH        jX                  jD                  |j(                  g       |S |jG                  |j                  tH        jZ                  jD                  |j(                  g       |S # t\        $ rx}
| j                   j                  j5                  t        |j                        d       t        j                  d| j                   d|j                   d| d|
        Y d}
~
yd}
~
ww xY w)z+Pure functional check worker implementationrN   ] unknown provider: , type: Nr   r   z$] rate limit exceeded for provider: r   )tokenr   r   modelTrX   Fr   r]   r^   )/r,   	providersgetrB   rA   r   rP   rl   r&   typer   r   r   r   r   r   r   r   re   checkr   r   
custom_urlr   r   r   report_resultr!   	availabler#   create_inspect_taskrc   r   INSPECTr3   r   r   VALIDreasonr   NO_QUOTARATE_LIMITEDNO_MODEL	NO_ACCESS
WAIT_CHECKINVALIDrk   )r5   r;   rB   r   r   r   r   resultrp   inspect_taskrw   s              r8   r   zCheckStage._check_worker  s]   B	~~//33DMMBH:h	#Bq+?hW[\dWeVfgh ,DMM:L>>))11,? NN22<<\J	q=JJy)>>1199,G!%!7!7!C!CL!Q4:FLL				{*Nt}}o]dendop  $ ^^ll&&?4<<+?+?..ll((	 $ F NN""00tD !d+F *>>t}}dll[m.C.C.I.IJ !!$--1A1A1G1G$,,X" M ==K$8$88%%dmmZ5H5H5N5NQUQ]Q]P^_ M ]],,(())' 
 %%dmmZ5J5J5P5PSWS_S_R`a
 M %%dmmZ5G5G5M5MPTP\P\~^M 	NN""001A$--1PRWXLL1TYYK'=dmm_HUYTZZefgehij	s<   A<O* ?DO* EO* AO* +A<O* (AO* *	Q+3A.Q&&Q+)r   r   r   r   r"   r    r1   r   r   rH   r   rK   r   r!   rV   r   r   r   r   s   @r8   r   r   g  s{     QR. R= RF F# F+ + +(, (8K3H (D) D0E Dr:   r   z+Inspect API capabilities for validated keysc                   x     e Zd ZdZdedef fdZdedefdZ	dede
fdZdedee   fd	Zdedee   fd
Z xZS )InspectStagezNPipeline stage for inspecting API capabilities with pure functional processingr,   r-   c                 Z    t        |   t        j                  j                  ||fi | y r/   )r0   r1   r   r   r3   r4   s       r8   r1   zInspectStage.__init__  s%    ..44iSFSr:   r;   r<   c                 8   t        |t              r|nd}|r[|j                  rO|j                  }t        j                  j
                   d|j                   d|j                   d|j                   S t        j                  j
                   d|j                   dS r   )	rA   r   r   r   r   r3   rB   r   r   )r5   r;   r   r   s       r8   rH   zInspectStage._generate_id  s    )$<t$L00"**G#++112!DMM?!GKK=PQRYRaRaQbcc''--.ahGGr:   c                 "    t        |t              S )z%Validate that task is an InspectTask.)rA   r   rJ   s     r8   rK   z InspectStage._validate_task_type  s    $,,r:   c                 $    | j                  |      S )z Execute inspect task processing.)_inspect_workerrJ   s     r8   rV   zInspectStage._execute_task  s    ##D))r:   c                    	 | j                   j                  j                  |j                        }|rt	        |t
              s<t        j                  d| j                   d|j                   dt        |              y|j                  |j                  j                  |j                  j                  |j                  j                        }t        |      }|r1|j!                  |j                  |j                  j                  |       |S # t"        $ r?}t        j                  d| j                   d|j                   d| d	|        Y d}~yd}~ww xY w)
z-Pure functional inspect worker implementationrN   r   r   N)r   r   r   rX   z"] inspect models error, provider: r]   r^   )r,   r   r   rB   rA   r   rP   rl   r&   r   inspectr   r   r   r   r!   
add_modelsrk   )r5   r;   rB   modelsrp   rw   s         r8   r   zInspectStage._inspect_worker  s   	~~//33DMMBH:h	#Bq+?hW[\dWeVfgh %%ll&&0D0Dt||OdOd & F
 !d+F !!$--1A1A6JM 	LL1TYYK'I$--X`ae`ffqrsqtuv	s   A<D ?BD 	E5EE)r   r   r   r   r"   r    r1   r   r   rH   r   rK   r   r!   rV   r   r   r   r   s   @r8   r   r     sx     YT. T= TH H# H- - -*, *8K3H *K H[4I r:   r   );r   r   r   typingr   r   r   constant.searchr   r   r   r	   r
   r   constant.systemr   r   
core.enumsr   r   r   core.modelsr   r   r   r   r   r   r   
core.typesr   refine.enginer   rM   r   tools.loggerr   tools.utilsr   r   baser   r    r!   r"   factoryr#   registryr$   rP   r2   r3   rh   rd   r+   r   r   r   r    r:   r8   <module>r     s  
   ( (  M = =   ! &  # ; O O   $	G	 				#	#&&,,m.A.A.G.GH6	v# vvr 				#	#$$**+%%++,2	3( 33l 				"	"''--.#	[" [[| 				$	$=	0$ 00r:   