
    =1h0S                     p    d Z ddlZddlZddlmZ ddlmZ ddlZ ej                  e	      Z
 G d d      Zy)z
Advanced Layout Structure Analyzer
---------------------------------
Detects actual website layout patterns, positioning, and hierarchical structure
for creating accurate Annotated Structure References (ASR).
    N)BeautifulSoup)urljoinc                       e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zy)LayoutAnalyzerz0Advanced layout structure detection and analysisc                 b    g dg dg dg dg dg dg dd| _         d	d
gdgdgddgd| _        y )N)headertopnavbarmastheadzsite-header)navmenu
navigationr
   zmain-nav)herobanner	jumbotronintrosplashlanding)maincontentprimaryzmain-content)sidebaraside	secondaryzwidget-area)footerbottomzsite-footerzpage-footer)sectionblockmodule	component)r   r   r   r   r   r   r   absolutefixedstickyrelativestaticinitial)r!   r#   r$   r%   )layout_patternspositioning_keywords)selfs    L/home/totalfocus.sw7ft.com/public_html/site_rebuilder/lib/layout_analyzer.py__init__zLayoutAnalyzer.__init__   sG    LMQBGHB 
 $W-j#+	%
!    c                 
   t        |d      }| j                  ||      }| j                  |      }g }|D ])  }| j                  |||      }|s|j	                  |       + | j                  ||      }	| j                  |	      }
|
S )zH
        Main method to analyze layout structure and create ASR
        zhtml.parser)r   _extract_css_rules_find_structural_elements_analyze_elementappend_build_layout_hierarchy_generate_asr)r)   html_contenturlsoup	css_rulesstructural_elementsanalyzed_elementselementanalysislayout_treeasrs              r*   analyze_layout_structurez'LayoutAnalyzer.analyze_layout_structure&   s     \=9 ++D#6	 #<<TB * 	3G,,WiFH!((2	3 223DdK   -
r,   c                    i }|j                  d       D ]b  }|j                  dd      }|j                  dg       }|j                  dd      }| j                  |      }|r||d| <   |D ]
  }	||d|	 <    d |j                  d      D ]3  }
|
j                  }|s| j	                  |      }|j                  |       5 |S )z8Extract CSS rules from style tags and linked stylesheetsc                 $    | j                  d      S )Nstylehas_attrtags    r*   <lambda>z3LayoutAnalyzer._extract_css_rules.<locals>.<lambda>F   s    g1F r,   id classrA   #.)find_allget_parse_css_propertiesstring_parse_css_contentupdate)r)   r6   base_urlr7   r:   
element_idelement_classesinline_stylecss_propertiescls	style_tagcss_contentparsed_ruless                r*   r.   z!LayoutAnalyzer._extract_css_rulesA   s    	 }}%FG 	6G T2.J%kk'26O";;w3L!77EN .<	Aj\*+& 6'5	AcU)$6	6 w/ 	/I#**K#66{C  .		/ r,   c                     i }|j                  d      D ]K  }d|v s|j                  dd      \  }}|j                         ||j                         j                         <   M |S )z(Parse CSS properties from a style string;:   )splitstriplower)r)   
css_string
propertiespropkeyvalues         r*   rN   z$LayoutAnalyzer._parse_css_properties\   sg    
 $$S) 	@Dd{!ZZQ/
U27++-
399;,,./	@
 r,   c                    i }t        j                  d      }|j                  |      D ]V  }|j                  d      j	                         }|j                  d      j	                         }| j                  |      }|||<   X |S )z"Parse CSS content to extract rulesz([^{]+)\{([^}]+)\}r^      )recompilefinditergroupr`   rN   )r)   rY   rulescss_rule_patternmatchselectorproperties_stringrc   s           r*   rP   z!LayoutAnalyzer._parse_css_contenth   s     ::&;<%..{; 	)E{{1~++-H %A 4 4 6334EFJ(E(O	) r,   c                    g }g d}|D ]/  }|j                  |      }|D ]  }|j                  ||dd        1 |j                  g d      D ]1  }| j                  |      }|s|dk7  s|j                  ||dd       3 |j                  d       D ]@  }| j                  |j	                  d            }|s&|dk7  s,|j                  ||d	d       B | j                  |      S )
z*Find major structural elements in the HTMLr   r   r   r   articler   r   semantic_html)r:   typedetection_method)divr   r   r   unknownclass_analysisc                 $    | j                  d      S )NrG   rB   rD   s    r*   rF   z:LayoutAnalyzer._find_structural_elements.<locals>.<lambda>   s    d1C r,   rG   id_analysis)rL   r1   _classify_by_attributes_classify_by_idrM   _deduplicate_elements)r)   r6   r8   semantic_tagsrE   elementsr:   element_types           r*   r/   z(LayoutAnalyzer._find_structural_elementsx   s     [  	C}}S)H# #**&(7, 	 }}%KL 	G77@L	 9#**&((8, 	 }}%CD 	G//D0ABL	 9#**&((5, 	 ))*=>>r,   c                     |j                  dg       }dj                  |      j                         }| j                  j	                         D ]  \  }}|D ]  }||v s|c c S   y)z*Classify element type based on class namesrI    ry   )rM   joinra   r'   items)r)   r:   classesclass_stringlayout_typekeywordskeywords          r*   r}   z&LayoutAnalyzer._classify_by_attributes   sn    ++gr*xx(..0%)%9%9%?%?%A 	'!K# 'l*&&'	'
 r,   c                     |j                         }| j                  j                         D ]  \  }}|D ]  }||v s|c c S   y)z!Classify element type based on IDry   )ra   r'   r   )r)   rS   r   r   r   s        r*   r~   zLayoutAnalyzer._classify_by_id   sU    %%'
%)%9%9%?%?%A 	'!K# 'j(&&'	'
 r,   c                     t               }g }|D ]9  }|d   }t        |      }||vs|j                  |       |j                  |       ; |S )z'Remove duplicate elements from the listr:   )setrG   addr1   )r)   r8   seen_elementsunique_elementsitemr:   rS   s          r*   r   z$LayoutAnalyzer._deduplicate_elements   s[    ' 	-D9oGGJ.!!*-&&t,	- r,   c                 .   |d   }|d   }||j                   |d   |j                  dd      |j                  dg       d| j                  ||      | j                  ||      | j	                  |      | j                  |      | j                  ||      d	}|S )	z@Analyze individual element for positioning, styling, and contentr:   rv   rw   rG   rH   rI   )rG   r   )	rv   rE   rw   
attributespositioningstylingr   childrenrelationships)namerM   _analyze_positioning_analyze_styling_analyze_content_find_child_elements_analyze_relationships)r)   element_datar7   r6   r:   r   r;   s          r*   r0   zLayoutAnalyzer._analyze_element   s    y)#F+ !<< ,-? @kk$+";;w3  44WiH,,Wi@,,W511':!88$G
 r,   c                 "   ddddddddddd
}|j                  dd      }|r1| j                  |      }|j                  | j                  |             |j                  dd      }|j                  dg       }|r/d	| |v r(|d	|    }|j                  | j                  |             |D ]2  }	d
|	 |v s|d
|	    }|j                  | j                  |             4 |d   dk(  |d<   |d   dk(  |d<   |d   dk(  |d<   |d   xs |d   |d<   |S )z&Analyze element positioning propertiesr%   NF)
positionr	   rightr   leftz_indexis_fixedis_absolute	is_sticky
is_overlayrA   rH   rG   rI   rJ   rK   r   r"   r   r!   r   r#   r   r   )rM   rN   rQ   _extract_position_props)
r)   r:   r7   r   rU   inline_propsrS   rT   	css_propsrW   s
             r*   r   z#LayoutAnalyzer._analyze_positioning   sm    ! 
 {{7B/55lCLt;;LIJ [[r*
!++gr2 Aj\*i7!Aj\"23It;;IFG # 	LC3%yI%%#i0	""4#?#?	#JK	L #.j"9W"DJ%0%<
%JM"#.z#:h#FK $/$>$Y+jBYL!r,   c                 j    i }d|v r|d   |d<   dD ]  }||v s||   ||j                  dd      <   ! |S )z7Extract positioning properties from CSS properties dictr   )r	   r   r   r   zz-index-_)replace)r)   r   position_propsrd   s       r*   r   z&LayoutAnalyzer._extract_position_props  sX    ")2:)>N:&A 	IDy 9B4t||C56	I r,   c           
      >   dddddddddd	}|j                  dd      }|r1| j                  |      }|j                  | j                  |             |j                  dd      }|j                  dg       }|r/d| |v r(|d|    }|j                  | j                  |             |D ]2  }	d	|	 |v s|d	|	    }|j                  | j                  |             4 t	        |d
         |d<   | j                  |d         |d<   | j                  |d         |d<   |S )z"Analyze element styling propertiesNF)	background_imagebackground_colorheightwidthis_full_heightis_full_widthhas_background_imageopacitydisplayrA   rH   rG   rI   rJ   rK   r   r   r   r   r   r   )rM   rN   rQ   _extract_style_propsbool_is_full_height_is_full_width)
r)   r:   r7   r   rU   r   rS   rT   r   rW   s
             r*   r   zLayoutAnalyzer._analyze_styling  sU    !% $#"$)

 {{7B/55lCLNN444\BC [[r*
!++gr2Aj\*i7!Aj\"23INN444Y?@" 	EC3%yI%%#i0	t88CD	E +/w7I/J*K&'$($8$89J$K !#'#6#6ww7G#H r,   c                 f    i }ddddddd}|j                         D ]  \  }}||v s||   ||<    |S )z3Extract styling properties from CSS properties dictr   r   r   r   r   r   )zbackground-imagezbackground-colorr   r   r   r   )r   )r)   r   style_propsstyle_mappingscss_prop	style_keys         r*   r   z#LayoutAnalyzer._extract_style_props<  s`     !3 2  
 $2#7#7#9 	=Hi9$)28)<I&	= r,   c                 R    syj                         t        fddD              S )z)Determine if height indicates full heightFc              3   &   K   | ]  }|v  
 y wN ).0r   height_values     r*   	<genexpr>z1LayoutAnalyzer._is_full_height.<locals>.<genexpr>U  s     Tw7l*T   )100vh100%fullra   any)r)   r   s    `r*   r   zLayoutAnalyzer._is_full_heightO  s)    #))+T:STTTr,   c                 R    syj                         t        fddD              S )z'Determine if width indicates full widthFc              3   &   K   | ]  }|v  
 y wr   r   )r   r   width_values     r*   r   z0LayoutAnalyzer._is_full_width.<locals>.<genexpr>]  s     Sg7k)Sr   )100vwr   r   r   )r)   r   s    `r*   r   zLayoutAnalyzer._is_full_widthW  s)    !'')S9RSSSr,   c                    t        |j                  d            dkD  r|j                  d      dd dz   n|j                  d      t        |j                  d            dkD  t        |j                  d            dkD  t        |j                  d	            dkD  g g d
}|j                  dk(  sdt	        |j                  dg             v rY|j                  d      }|D cg c]9  }|j                  d      r%|j                  d      |j                  dd      d; c}|d<   |j                  d      }|D cg c]'  }|j                  dd      |j                  dd      d) c}|d<   |S c c}w c c}w )zAnalyze element contentT)r`      Nz...imgr   aform)text_content
has_images	has_links	has_formsnavigation_linksimagesr   rI   hrefrH   )textr   r   srcalt)r   r   r   )lenget_textrL   r   strrM   )r)   r:   r   linkslinkr   r   s          r*   r   zLayoutAnalyzer._analyze_content_  s    KNgN^N^eiN^NjJknqJqG,,4,8#>Fw~  xH  xH  OS  xH  xTg..u56:W--c23a7W--f56: "
 <<5 ESWb1I-J$J$$S)E "+
 T]]]%> !MMM5 HHVR0+G&' !!%( 

  wwub)wwub)
 %+
s   ">E/;,E4c           
         g }|j                  g dd      D ]d  }| j                  |      }|dk(  r|j                  }|j                  ||j                  |j	                  dd      |j	                  dg       d       f |S )	z%Find direct child structural elementsrs   F)	recursivery   rG   rH   rI   )rv   rE   rG   r   )rL   r}   r   r1   rM   )r)   r:   r   child
child_types        r*   r   z#LayoutAnalyzer._find_child_elements  s    %%&hty%z 
	E55e<JY&"ZZ
OO"zziib) 99Wb1	 
	 r,   c                     dg g | j                  ||      d}|j                  g d      }|rB|j                  dk7  r3|j                  |j                  dd      |j                  dg       d|d	<   |S )
z7Analyze element relationships (overlaps, nesting, etc.)N)parentoverlaps_withcontainsposition_in_flow)r   r   r   r   rt   r   r   bodyr   rG   rH   rI   )rv   rG   r   r   )_get_position_in_flowfind_parentr   rM   )r)   r:   r6   r   r   s        r*   r   z%LayoutAnalyzer._analyze_relationships  s|      $ : :7D I	
 $$%opfkkV+jjr*!::gr2'M(# r,   c                     |j                   }|sy|j                  g d      }	 |j                  |      S # t        $ r Y yw xY w)z+Determine element position in document flowr   rs   )r   rL   index
ValueError)r)   r:   r6   r   all_elementss        r*   r   z$LayoutAnalyzer._get_position_in_flow  sH    yy}}%gh	%%g.. 		s   5 	A Ac                    |j                  d        g i d}|D ]d  }|d   d   rF|d   d   }|d   r|d    d|d    n|d   }||d	   vrg |d	   |<   |d	   |   j                  |       Q|d
   j                  |       f |S )z#Build hierarchical layout structurec                     | d   d   S )Nr   r   r   )xs    r*   rF   z8LayoutAnalyzer._build_layout_hierarchy.<locals>.<lambda>  s    Q-?@R-S r,   )re   )rootnestedr   r   rG   rv   rJ   r   r   )sortr1   )r)   r9   r6   r<   r:   parent_info
parent_keys          r*   r2   z&LayoutAnalyzer._build_layout_hierarchy  s     	#ST 

 ) 
	4G'1%o6x@MXY]M^F 34Ak$6G5HIdopvdw
[%::8:K)*5H%j188AF#**73
	4 r,   c                 b   g dddddg d}|d   D ]  }| j                  ||      }|d   j                  |       |d   j                  |       |d	   d
xx   dz  cc<   |d   dk(  r|d   d   rd|d	   d<   |d   dk(  sn| j                  |      |d	   d<    | j                  |      |d	   d<   |S )z&Generate Annotated Structure Referencer   Fry   )total_sectionshas_overlay_headernavigation_styler   )template_structurelayout_analysisdetailed_sectionsr   r  r  r  r  r^   rv   r   r   r   Tr  r   r  r   )_create_section_descriptionr1   _determine_nav_style_determine_layout_type)r)   r<   r=   r:   section_descriptions        r*   r3   zLayoutAnalyzer._generate_asr  s    #%"#&+$-(	  "$	
 #6* 	`G"&"B"B7K"X$%,,-@A#$++G4!"#3494 v(*w}/El/S?C%&';<v,.=A=V=VW^=_%&'9:	` 150K0KK0X}-
r,   c                    |d   j                         }g }|d   d   r9|d   d   dk(  r|j                  d       n|d   d   dk(  r|j                  d       |d   d	   r|j                  d
       |d   d   r|j                  d       |d   d   r|j                  d       |d   d   r|j                  d       |d   dk(  r&t        d |d   D              r|j                  d       |d   dk(  r6|d   d   r.|d   D ]&  }|d   dk(  s|d   d   s|j                  d        n |d   d   r%t        |d   d         dk  r|j                  d       |r| ddj	                  |       dS |S )z)Create human-readable section descriptionrv   r   r   r   r!   zAbsolute Positioningr"   zFixed Positioningr   Stickyr   r   zBackground Imager   zFull Heightr   z
Full Widthr   c              3   ,   K   | ]  }|d    dk(    yw)rv   r   Nr   )r   r   s     r*   r   z=LayoutAnalyzer._create_section_description.<locals>.<genexpr>  s      /
.3E&M\)/
s   r   zContains Navigationr   r   zOverlaps Heror   g      ?Transparentz (z, ))titler1   r   floatr   )r)   r:   r<   section_typeannotationsother_elements         r*   r	  z*LayoutAnalyzer._create_section_description  s   v,,. =!,/}%j1Z?""#9:'
3w>""#67=!+.x( 945129./}-9o.|, 6?h&3 /
7>z7J/
 ,
 45 6?h&7=+A,+O!,V!4  (F2}Y7OP`7a&&7 9i(U793Ei3P-QTW-W}- "^2dii&<%=Q??r,   c                 P    |d   d   }t        |      dk  ryt        |      dk  ryy)zDetermine navigation styler   r      minimal   standard	extensiver   )r)   nav_element	nav_linkss      r*   r
  z#LayoutAnalyzer._determine_nav_style  s3    	*+=>	y>Q^q r,   c                     |d   }|D cg c]  }|d   	 }}d|v r	d|v rd|v ryd|v ryt        |D cg c]  }|d   d	k(  s| c}      d
kD  ryyc c}w c c}w )zDetermine overall layout typer   rv   r   r   r   landing_pager   blog_layoutr   r  multi_sectionsimple_layoutr  )r)   r<   root_elementselemelement_typeses         r*   r  z%LayoutAnalyzer._determine_layout_type)  s}    #F+2?@$f@@}$=)@XQ^E^!-' ]Eai9.D!EFJ"" A Fs   AAAN)__name__
__module____qualname____doc__r+   r>   r.   rN   rP   r/   r}   r~   r   r0   r   r   r   r   r   r   r   r   r   r   r2   r3   r	  r
  r  r   r,   r*   r   r      s    :
$66
 $?L
	,*X'R&UT D$(2@/ b	#r,   r   )r,  ri   jsonbs4r   urllib.parser   logging	getLoggerr)  loggerr   r   r,   r*   <module>r3     s9    
     			8	$d# d#r,   