// // File: hzDocHtml.cpp // // Legal Notice: This file is part of the HadronZoo C++ Class Library. Copyright 2025 HadronZoo Project (http://www.hadronzoo.com) // // The HadronZoo C++ Class Library is free software: You can redistribute it, and/or modify it under the terms of the GNU Lesser General Public License, as published by the Free // Software Foundation, either version 3 of the License, or any later version. // // The HadronZoo C++ Class Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR // A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License along with the HadronZoo C++ Class Library. If not, see http://www.gnu.org/licenses. //
// // Management of HTML documents //
#include <fstream>
#include <sys/stat.h>
#include "hzChars.h" #include "hzTextproc.h" #include "hzDirectory.h" #include "hzDocument.h" #include "hzProcess.h"
using namespace std ;
/* ** Variables */
//	Lookup tables for HTML tag forms. Both maps are filled by InitHtml(), which inserts every tag form into each of them.
static hzMapS<hzString,hzHtagform> s_htagNam ;	//	All HTML tags by name (key is the tag name as registered by InitHtml)
static hzMapS<hzHtagtype,hzHtagform> s_htagTyp ;	//	All HTML tags by type (key is the hzHtagtype value)

static hzHtagform s_tagformDuff ;	//	Null tag form. Handed back by the lookup functions in this file when no valid tag applies.
static uint32_t s_htagPop ;	//	This is set by InitHtml() to the number of HTML tags, to indicate that the tags have been set up. While non-zero, InitHtml() returns E_SETONCE.
/* ** SECTION 1: HTML Tag Types */
hzEcode InitHtml (void) { // Category: Data Initialization // // Populate the map of tag names to tag forms and the map of tag types to tag forms (see hzHtagform definition). This facilitates HTML tag lookup for such // purposes as the import and processing of HTML documents. // // Arguments: None // // Returns: E_SETONCE If the HTML maps are already populated // E_OK If the operation was successful
_hzfunc(__func__) ;
if (s_htagPop) return E_SETONCE ;
hzHtagform t ; // Full tag info for insertion
// Default (invalid) t.klas=HTCLASS_NUL; t.rule=HTRULE_NULL; t.type=HTAG_NULL; t.name=(char*)0; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Page structure tags t.klas=HTCLASS_HDR; t.rule=HTRULE_SINGLE; t.type=HTAG_DOCTYPE; t.name="!DOCTYPE"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_HTML; t.name="html"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_HEAD; t.name="head"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_TITLE; t.name="title"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_META; t.name="meta"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_BODY; t.name="body"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_BASE; t.name="base"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_BASEFONT; t.name="basefont"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_STYLE; t.name="style"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Programing tags t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_SCRIPT; t.name="script"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_NOFRAMES; t.name="noframes"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_NOSCRIPT; t.name="noscript"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_HDR; t.rule=HTRULE_PAIRED; t.type=HTAG_APPLET; t.name="applet"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Frames t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_FRAME; t.name="frame"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_FRAMESET; t.name="frameset"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_IFRAME; t.name="iframe"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_SINGLE; t.type=HTAG_PARAM; t.name="param"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// System tags t.klas=HTCLASS_SYS; t.rule=HTRULE_PAIRED; t.type=HTAG_EMBED; t.name="embed"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_SYS; t.rule=HTRULE_PAIRED; t.type=HTAG_NOEMBED; t.name="noembed"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Font control or text tags - no content t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_BOLD; t.name="b"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_ULINE; t.name="u"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HATG_ITALIC; t.name="i"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_EM; t.name="em"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_STRONG; t.name="strong"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_CENTER; t.name="center"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_FONT; t.name="font"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_BIG; t.name="big"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_SMALL; t.name="small"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_SINGLE; t.type=HATG_BR; t.name="br"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_SINGLE; t.type=HTAG_HR; t.name="hr"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Text description tags t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_HEADER; t.name="header"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_FOOTER; t.name="footer"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_SECTION; t.name="section"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_ARTICLE; t.name="article"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_ASIDE; t.name="aside"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DETAILS; t.name="details"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_SUMMARY; t.name="summary"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DIALOG; t.name="dialog"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Text grouping tags t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_STRIKE; t.name="strike"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_S; t.name="s"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_DEL; t.name="del"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_INS; t.name="ins"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_KBD; t.name="kbd"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_TXT; t.rule=HTRULE_PAIRED; t.type=HTAG_SPAN; t.name="span"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Text control tags t.klas=HTCLASS_DAT; t.rule=HTRULE_OPTION; t.type=HTAG_PARAG; t.name="p"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_QUOTATION; t.name="q"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H1; t.name="h1"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H2; t.name="h2"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H3; t.name="h3"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H4; t.name="h4"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H5; t.name="h5"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_H6; t.name="h6"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HATG_TT; t.name="tt"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_CODE; t.name="code"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_SAMP; t.name="samp"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_CITE; t.name="cite"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_CAPTION; t.name="caption"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_VAR; t.name="var"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_PRE; t.name="pre"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_BQ; t.name="bq"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_BLOCKQUOTE; t.name="blockquote"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAB_BDO; t.name="bdo"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_SUBSCRIPT; t.name="sub"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_SUPERSCRIPT; t.name="sup"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Data/layout tags t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TABLE; t.name="table"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_OPTION; t.type=HTAG_TCOL; t.name="col"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_OPTION; t.type=HTAG_TCOLGRP; t.name="colgroup"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TH; t.name="th"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TR; t.name="tr"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TBL_CEL; t.name="td"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DIV; t.name="div"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TBODY; t.name="tbody"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_THEAD; t.name="thead"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TFOOT; t.name="tfoot"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_FIELDSET; t.name="fieldset"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_LEGEND; t.name="legend"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_MENU; t.name="menu"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DT; t.name="dt"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DD; t.name="dd"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); 
t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DFN; t.name="dfn"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DIR; t.name="dir"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_DLIST; t.name="dl"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_OLIST; t.name="ol"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_ULIST; t.name="ul"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_ITEM; t.name="li"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_HGROUP; t.name="hgroup"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_DAT; t.rule=HTRULE_PAIRED; t.type=HTAG_TIME; t.name="time"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Link tags t.klas=HTCLASS_LNK; t.rule=HTRULE_PAIRED; t.type=HTAG_ANCHOR; t.name="a"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_LNK; t.rule=HTRULE_PAIRED; t.type=HTAG_NAV; t.name="nav"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_LNK; t.rule=HTRULE_PAIRED; t.type=HTAG_LINK; t.name="link"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Input/form tags t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_FORM; t.name="form"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_SINGLE; t.type=HTAG_INPUT; t.name="input"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_TEXTAREA; t.name="textarea"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_SELECT; t.name="select"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_OPTGROUP; t.name="optgroup"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_OPTION; t.name="option"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_BUTTON; t.name="button"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INP; t.rule=HTRULE_PAIRED; t.type=HTAG_LABEL; t.name="label"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Information tags t.klas=HTCLASS_INF; t.rule=HTRULE_PAIRED; t.type=HTAG_ABBR; t.name="abbr"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INF; t.rule=HTRULE_PAIRED; t.type=HTAG_ACRONYM; t.name="acronym"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_INF; t.rule=HTRULE_PAIRED; t.type=HTAG_ADDRESS; t.name="address"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Image tags t.klas=HTCLASS_IMG; t.rule=HTRULE_SINGLE; t.type=HTAG_IMG; t.name="img"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_MAP; t.name="map"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_AREA; t.name="area"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_OBJECT; t.name="object"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_MARQUEE; t.name="marquee"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_CANVAS; t.name="canvas"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_FIGURE; t.name="figure"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_FIGCAPTION; t.name="figcaption" ; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Image SVG tags t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_SVG; t.name="svg"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_ANIMATE; t.name="animate"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_ANIMATEMOTION; t.name="animateMotion"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_ANIMATEXFORM; t.name="animateTransform"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_CIRCLE; t.name="circle"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_CLIPPATH; t.name="clipPath"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DEFS; t.name="defs"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DESC; t.name="desc"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DISCARD; t.name="discard"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_ELLIPSE; t.name="ellipse"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_BLEND; t.name="feBlend"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_COLORMATRIX; t.name="feColorMatrix"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_COMPONENTXFER; t.name="feComponentTransfer"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_COMPOSITE; t.name="feComposite"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_CONVOLVEMATRIX; t.name="feConvolveMatrix"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DIFFUSELIGHTING; t.name="feDiffuseLighting"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DISPLACEMENTMAP; t.name="feDisplacementMap"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DISTANTLIGHT; t.name="feDistantLight"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DROPSHADOW; t.name="feDropShadow"; 
s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FLOOD; t.name="feFlood"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FUNC_A; t.name="feFuncA"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FUNC_B; t.name="feFuncB"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FUNC_G; t.name="feFuncG"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FUNC_R; t.name="feFuncR"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_DEGAUSS; t.name="feGaussianBlur"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_IMAGE; t.name="feImage"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MERGE; t.name="feMerge"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MERGENODE; t.name="feMergeNode"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MORPHOLOGY; t.name="feMorphology"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_OFFSET; t.name="feOffset"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_POINTLIGHT; t.name="fePointLight"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_SPECLIGHT; t.name="feSpecularLighting"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_SPOTLIGHT; t.name="feSpotLight"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_TITLE; t.name="feTile"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_TURBULENCE; t.name="feTurbulence"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FILTER; t.name="filter"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_FOREIGNOBJECT; t.name="foreignObject"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_GENERIC; t.name="g"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); 
t.type=HTAG_SVG_HATCH; t.name="hatch"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_HATCHPATH; t.name="hatchpath"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_LINE; t.name="line"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_lINEARGRADIENT; t.name="linearGradient"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MARKER; t.name="marker"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MASK; t.name="mask"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_METADATA; t.name="metadata"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_MPATH; t.name="mpath"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_PATH; t.name="path"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_PATTERN; t.name="pattern"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_POLYGON; t.name="polygon"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_POLYLINE; t.name="polyline"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_RADIALGRADIENT; t.name="radialGradient"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_RECT; t.name="rect"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_SET; t.name="set"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_STOP; t.name="stop"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_SWITCH; t.name="switch"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_SYMBOL; t.name="symbol"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_TEXT; t.name="text"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_TEXTPATH; t.name="textPath"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_TSPAN; t.name="tspan"; 
s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_USE; t.name="use"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.type=HTAG_SVG_VIEW; t.name="view"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Audio/Video Tags t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_AUDIO; t.name="audio"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_SOURCE; t.name="source"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_TRACK; t.name="track"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t); t.klas=HTCLASS_IMG; t.rule=HTRULE_PAIRED; t.type=HTAG_VIDEO; t.name="video"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
// Third party tags t.klas=HTCLASS_3RD; t.rule=HTRULE_SINGLE; t.type=HTAG_FBLIKE; t.name="fb:like"; s_htagTyp.Insert(t.type,t); s_htagNam.Insert(t.name,t);
s_htagPop = s_htagNam.Count() ; return E_OK ; }
const char*	Doctype2Txt	(hzDoctype dtype)
{
	//	Category:	Diagnostics
	//
	//	Convert hzDoctype enum to text for diagnostics. Any value outside the range DOCTYPE_UNDEFINED to DOCTYPE_XML inclusive is reported as
	//	"DOCTYPE_UNDEFINED".
	//
	//	Arguments:	1)	dtype	The enumerated document type (either HTML or XML)
	//
	//	Returns:	Pointer to the doctype text form

	static	const char*	strings	[] =
	{
		"DOCTYPE_UNDEFINED",
		"DOCTYPE_HTML",
		"DOCTYPE_XML",
		""
	} ;

	//	DOCTYPE_XML is itself a legal value, so only values beyond it are rejected. The former test (>=) made the "DOCTYPE_XML" entry unreachable.
	//	Assumes the enum values index the table above in order - TODO confirm against the hzDoctype definition.
	if (dtype < 0 || dtype > DOCTYPE_XML)
		return strings[0] ;
	return strings[dtype] ;
}

hzString	Tagtype2Txt	(hzHtagtype type)
{
	//	Category:	Diagnostics
	//
	//	Convert a HTML tag type (enum) into a string naming the type. The name is taken from the tag form held in the map of tags by type. If the
	//	supplied type is below HTAG_NULL, or is not less than the population of that map, the name of the null tag form is returned instead.
	//
	//	Arguments:	1)	type	The enumerated HTML tag type
	//
	//	Returns:	Instance of hzString by value

	//	If tagmap not loaded, load it
	if (!s_htagNam.Count())
		InitHtml() ;

	if (type < HTAG_NULL)
		return s_tagformDuff.name ;

	if (s_htagTyp.Count() <= (uint32_t) type)
		return s_tagformDuff.name ;

	return s_htagTyp[type].name ;
}
hzHtagtype	Txt2Tagtype	(const hzString& htag)
{
	//	Category:	Config
	//
	//	Translate a tag name into its enumerated HTML tag type. The name is lowercased before the map of tags by name is searched, so the caller may
	//	supply it in any case.
	//
	//	Arguments:	1)	htag	A string presumed to be one of the allowed HTML5 tags
	//
	//	Returns:	Enumerated hzHtagtype, being the type held by whatever tag form the name lookup yields

	_hzfunc(__func__) ;

	//	Ensure the tag maps are populated
	if (!s_htagPop)
		InitHtml() ;

	//	Work on a lowercase copy so the caller's string is left alone
	hzString	lcName = htag ;

	lcName.ToLower() ;

	return s_htagNam[lcName].type ;
}
const hzHtagform&	TagLookup	(const hzString& htag)
{
	//	Category:	Internet
	//
	//	Fetch the tag form (hzHtagform) registered under the supplied tag name. The name is used as supplied, no case conversion is applied here.
	//
	//	Arguments:	1)	htag	A string presumed to be one of the allowed HTML5 tags
	//
	//	Returns:	Reference to the tag form for the tag

	//	Populate the tag maps on first use
	if (s_htagNam.Count() == 0)
		InitHtml() ;

	const hzHtagform&	form = s_htagNam[htag] ;

	return form ;
}
const hzHtagform&	TagLookup	(chIter& ci)
{
	//	Category:	Internet
	//
	//	Determine if the supplied chain iterator is at the start of a legal HTML tag or anti-tag. The sequence must open with '<', optionally followed
	//	by '/'. Everything from there up to the first space or '>' is taken as the tag name, lowercased and looked up in the map of tags by name.
	//
	//	The supplied iterator is not advanced, all scanning is done on a copy.
	//
	//	If the chain ends before a space or '>' is met, the sequence cannot be a complete tag and the null tag form is returned. Previously the scan
	//	had no end-of-chain test and could only ever stop on a space or '>'.
	//
	//	Arguments:	1)	ci	A chain iterator to be tested to see if it is at the beginning of an allowed HTML5 tag
	//
	//	Returns:	Reference to the tag form for the tag, or to the null tag form if the sequence does not start with '<' or is unterminated

	hzChain		W ;			//	Working chain
	chIter		xi ;		//	Internal chain iterator
	hzString	word ;		//	Individual word

	//	If tagmap not loaded, load it
	if (!s_htagNam.Count())
		InitHtml() ;

	xi = ci ;
	if (*xi != CHAR_LESS)
		return s_tagformDuff ;
	xi++ ;
	if (*xi == CHAR_FWSLASH)
		xi++ ;

	//	Gather the tag name, stopping at the first space, the closing '>' or the end of the chain
	for (; !xi.eof() ; xi++)
	{
		if (*xi == CHAR_SPACE || *xi == CHAR_MORE)
			break ;
		W.AddByte(*xi) ;
	}

	//	Ran out of data before the name was terminated
	if (xi.eof())
		return s_tagformDuff ;

	word = W ;
	word.ToLower() ;
	return s_htagNam[word] ;
}
/* ** Tag cleanup */
hzHtagInd	AtHtmlTag	(hzString& tagseq, chIter& ci)
{
	//	Category:	Text Processing
	//
	//	Determines if the supplied chain iterator marks the start of a sequence that amounts to a legal HTML tag or anti-tag. If it does not, then
	//	HTAG_IND_NULL is returned and the supplied string will be empty. If the sequence has the right form, a case-insensitive lookup (the name is
	//	lowercased) is performed to test the name part against all known HTML tags. If this finds a match, the whole sequence is then scanned up to
	//	the closing '>' and the supplied string is populated with it (including the opening and closing angle brackets).
	//
	//	The tag name must begin with an alpha character and may continue with alphanumerics or colons. This is the same name scan as is applied by
	//	hzDocHtml::_proctag and is needed for registered names such as "h1" and "fb:like", which an alpha-only scan cannot match.
	//
	//	Within the tag, anything between double quotes is copied without being examined for '>' or "/>". The supplied iterator is not advanced.
	//
	//	Arguments:	1)	tagseq	If a tag is found, this string reference will be populated by it.
	//				2)	ci		The test chain iterator
	//
	//	Returns:	HTAG_IND_NULL	If the sequence is not a known HTML tag or anti-tag, or has no closing '>'
	//				HTAG_IND_OPEN	If the sequence is a HTML tag
	//				HTAG_IND_ANTI	If the sequence is a HTML anti-tag
	//				HTAG_IND_SELF	If the sequence is a self closing tag, i.e. it ends with "/>" (eg <br/>)

	_hzfunc(__func__) ;

	hzChain		W ;			//	For building tagname
	chIter		zi ;		//	Used to iterate whole tag sequence.
	hzHtagform	tf ;		//	The tag form for the found tag (if any).
	hzString	tagname ;	//	The tag name
	hzHtagInd	retval ;	//	Return value (open tag, anti-tag or self closing tag)

	//	If tagmap not loaded, load it
	if (!s_htagNam.Count())
		InitHtml() ;

	//	Clear the supplied tag and set chain iter
	tagseq.Clear() ;
	zi = ci ;

	if (*zi != CHAR_LESS)
		return HTAG_IND_NULL ;

	zi++ ;
	if (*zi == CHAR_FWSLASH)
		{ retval = HTAG_IND_ANTI ; zi++ ; }
	else
		retval = HTAG_IND_OPEN ;

	//	Collect the tag name: Leading alpha, then any run of alphanumerics and colons
	if (!IsAlpha(*zi))
		return HTAG_IND_NULL ;
	for (; !zi.eof() && (*zi == CHAR_COLON || IsAlphanum(*zi)) ; zi++)
		W.AddByte(*zi) ;
	if (!W.Size())
		return HTAG_IND_NULL ;

	tagname = W ;
	W.Clear() ;
	tagname.ToLower() ;
	tf = s_htagNam[tagname] ;
	if (tf.type == HTAG_NULL)
		return HTAG_IND_NULL ;

	//	We have a HTML tag so build the complete tag for populating tagseq
	for (zi = ci ; !zi.eof() ; zi++)
	{
		W.AddByte(*zi) ;

		if (*zi == CHAR_DQUOTE)
		{
			//	Copy the quoted value through to its closing double quote
			for (zi++ ; !zi.eof() ; zi++)
			{
				W.AddByte(*zi) ;

				if (*zi == CHAR_BKSLASH)
					{ zi++ ; W.AddByte(*zi) ; }

				if (*zi == CHAR_DQUOTE)
					break ;
			}
			continue ;
		}

		if (*zi == CHAR_FWSLASH)
		{
			//	A "/>" pair marks the tag as self closing. Step onto the '>' so the test below ends the scan.
			if (zi == "/>")
				{ retval = HTAG_IND_SELF ; zi++ ; W.AddByte(*zi) ; }
		}

		if (*zi == CHAR_MORE)
			break ;
	}

	//	No closing '>' means the sequence is not a tag
	if (*zi != CHAR_MORE)
		return HTAG_IND_NULL ;

	tagseq = W ;
	return retval ;
}
void	XmlCleanHtags	(hzChain& output, const hzChain& input)
{
	//	Category:	Text Processing
	//
	//	Copy the input to the output, replacing every instance of '<' with "&lt;", every instance of '>' with "&gt;" and every '&' that does not
	//	begin an entity (as determined by AtEntity) with "&amp;". An ampersand that does begin an entity is copied unchanged so that the entity
	//	passes through intact. All other bytes are copied as they are. The output is appended to, it is not cleared first.
	//
	//	Arguments:	1)	output	The cleaned output
	//				2)	input	The unclean input
	//
	//	Returns:	None

	chIter		zi ;		//	Chain iterator
	uint32_t	ent ;		//	Entity value (needed by call to AtEntity)
	uint32_t	entLen ;	//	Entity length (needed by call to AtEntity)

	for (zi = input ; !zi.eof() ; zi++)
	{
		if (*zi == CHAR_LESS)
			output << "&lt;" ;
		else if (*zi == CHAR_MORE)
			output << "&gt;" ;
		else if (*zi == CHAR_AMPSAND)
		{
			//	Only a stray ampersand is escaped. One that opens an entity is kept as it is.
			if (AtEntity(ent, entLen, zi))
				output.AddByte(*zi) ;
			else
				output << "&amp;" ;
		}
		else
			output.AddByte(*zi) ;
	}
}
hzEcode	hzDocument::Init	(const hzUrl& url)
{
	//	Associate the document with a URL. This may only be done once: If the document already has a request URL, the call is reported as a
	//	duplicate and the existing URL is left as it is.
	//
	//	Arguments:	1)	url		The URL of the document
	//
	//	Returns:	E_INITDUP	If the document is already associated with a URL
	//				E_OK		If the document URL is set

	_hzfunc("hzDocument::Init") ;

	//	Normal case: No URL yet, so adopt the one supplied
	if (!*m_Info.m_urlReq)
	{
		m_Info.m_urlReq = url ;
		return E_OK ;
	}

	//	Already initialized. The report differs according to whether the caller repeated the same URL or tried a new one.
	if (m_Info.m_urlReq == url)
	{
		hzerr(E_INITDUP, "Duplicate call. Address already set to %s\n", *m_Info.m_urlReq) ;
	}
	else
	{
		hzerr(E_INITDUP, "Duplicate call. Addr=%s, arg=%s\n", *m_Info.m_urlReq, *url) ;
	}

	return E_INITDUP ;
}
hzDocHtml::hzDocHtml	(void)
{
	//	Constructor. Counts the new instance in the global memory statistics and starts with the root, head and body pointers all null.

	++_hzGlobal_Memstats.m_numDochtm ;

	m_pBody = 0 ;
	m_pHead = 0 ;
	m_pRoot = 0 ;
}
hzDocHtml::~hzDocHtml	(void)
{
	//	Destructor. Removes this instance from the global memory statistics count and then calls Clear().

	--_hzGlobal_Memstats.m_numDochtm ;

	Clear() ;
}
hzHtmElem*	hzDocHtml::_proctag	(hzHtmElem* pParent, chIter& ci, hzHtagtype type)
{
	//	Parse a single opening HTML tag into a new element. The chain iterator must be at the '<' char that starts the tag. The tag name is read, a new
	//	element is created and registered in m_vecTags, and then the attributes (name="value", name='value' or name=value) are collected into m_NodeAttrs
	//	under the new element's id.
	//
	//	If successful, the iterator will be advanced to one place beyond the terminating '>' and beyond any whitespace following it. If unsuccessful, the
	//	iterator will be left unchanged.
	//
	//	Side effects: For <a> and <link> tags, a href attribute is resolved to a URL and added to m_setLinks/m_vecLinks (once only). A href starting with
	//	"mailto:" has the address added to m_Emails instead. A href starting with '#' is ignored.
	//
	//	Arguments:	1)	pParent	The parent node (may be NULL)
	//				2)	ci		The iterator, positioned at the opening '<'
	//				3)	type	The tag type, as already determined by the caller. HTAG_TABLE selects a hzHtmTbl element, any other type a hzHtmElem.
	//
	//	Returns:	Pointer to a new hzHtmElem if the operation was successful
	//				NULL if function could not identify a tag or the tag was malformed
	//
	//	Scope:	Private to the hzDocHtml class.
	//
	//	NOTE(review): On failure after the tag name has been read, the new element (and any attributes gathered so far) remain in m_vecTags and
	//	m_NodeAttrs even though NULL is returned. Confirm whether callers rely on this.

	_hzfunc("hzDocHtml::_proctag") ;

	hzChain		theTag ;		//	The tag name as read from the source
	hzChain		Z ;				//	For building param names and values
	chIter		end ;			//	End of tag marker
	chIter		xi ;			//	Main operating chain iterator
	hzHtmElem*	pX ;			//	Grandparent element (table cell correction)
	hzHtmElem*	pNewnode ;		//	Tag found (new copy created)
	hzUrl		link_url ;		//	URL for links
	hzPair		attr ;			//	Attribute name/value pair
	hzString	tnam ;			//	Tag name
	hzString	attrName ;		//	Attr name
	hzString	attrValue ;		//	Attr value
	hzString	S ;				//	Temporary string
	uint32_t	nLine ;			//	Line number of tag
	hzHtagtype	ptype ;			//	Parent tag's type
	bool		bError = false ;	//	Parentage error flag (never set at present, so the final warning never fires)

	//	Check validity of call
	if (ci.eof())
	{
		threadLog("Invalid iterator\n") ;
		return 0 ;
	}

	nLine = ci.Line() ;

	if (*ci != '<')
	{
		threadLog("Line %d Wrong call\n", nLine) ;
		return 0 ;
	}

	//	Pre-process the tag and get tag name
	xi = ci ;
	xi++ ;

	if (!IsAlpha(*xi))
	{
		threadLog("Line %d Non-tag (< followed by non-alpha %d)\n", nLine, *xi) ;
		return 0 ;
	}

	for (; !xi.eof() && (*xi == CHAR_COLON || IsAlphanum(*xi)) ; xi++)
		theTag.AddByte(*xi) ;

	if (!theTag.Size())
	{
		threadLog("Line %d Tag un-named\n", nLine) ;
		return 0 ;
	}
	tnam = theTag ;

	//	Create the element and register it with the document
	if (type == HTAG_TABLE)
		pNewnode = new hzHtmTbl() ;
	else
		pNewnode = new hzHtmElem() ;
	pNewnode->Init(this, pParent, tnam, type, m_vecTags.Count(), ci.Line()) ;
	m_vecTags.Add(pNewnode) ;

	//	Collect tag attributes if any
	for (; !xi.eof() ;)
	{
		if (IsWhite(*xi))
			{ xi++ ; continue ; }

		//	A "/>" marks the tag as self-closing. The iterator is left on the '>' for the closure test below.
		if (*xi == CHAR_FWSLASH)
		{
			if (xi == "/>")
			{
				pNewnode->_setanti(xi.Line()) ;
				xi++ ;
				end = xi ;
				break ;
			}
		}

		if (*xi == CHAR_MORE)
			{ end = xi ; break ; }

		//	Not at end of tag, so should have attr=value sequence (otherwise error). Stray chars are reported and skipped.
		if (!IsAlpha(*xi))
		{
			threadLog("Line %d Error. Unexpected char is [%c]\n", nLine, *xi) ;
			xi++ ;
			continue ;
		}

		//	Attribute name
		Z.Clear() ;
		for (; !xi.eof() && (IsUrlnorm(*xi) || *xi == CHAR_COLON || *xi == CHAR_PERIOD || *xi == CHAR_MINUS || *xi == CHAR_USCORE) ; xi++)
			Z.AddByte(*xi) ;
		attrName = Z ;

		Z.Clear() ;
		attrValue = (char*)0 ;

		for (; !xi.eof() && IsWhite(*xi) ; xi++) ;

		if (*xi != CHAR_EQUAL)
		{
			//	Tag attribute does not have a value assignment part (="some_val"). This is an error although there are some sloppy exceptions, eg
			//	'allowfullscreen' in the <iframe> tag. For these the attribute name doubles as its value.
			if (pNewnode->Type() == HTAG_IFRAME || pNewnode->Type() == HTAG_TIME)
			{
				attrValue = attrName ;

				attr.name = attrName ;
				attr.value = attrValue ;
				m_NodeAttrs.Insert(pNewnode->GetUid(), attr) ;
				continue ;
			}

			threadLog("Line %d Tag %s param %s not assigned\n", nLine, *tnam, *attrName) ;
			return 0 ;
		}

		//	Get attribute value
		for (xi++ ; !xi.eof() && IsWhite(*xi) ; xi++) ;

		Z.Clear() ;
		if (*xi == CHAR_DQUOTE)
		{
			for (xi++ ; !xi.eof() && *xi != CHAR_DQUOTE ; xi++)
				Z.AddByte(*xi) ;
			if (xi.eof())
			{
				threadLog("Line %d Double-quote non-closure disqualifies tag\n", nLine) ;
				return 0 ;
			}
			xi++ ;
		}
		else if (*xi == CHAR_SQUOTE)
		{
			for (xi++ ; !xi.eof() && *xi != CHAR_SQUOTE ; xi++)
				Z.AddByte(*xi) ;
			if (xi.eof())
			{
				threadLog("Line %d Single-quote non-closure disqualifies tag\n", nLine) ;
				return 0 ;
			}
			xi++ ;
		}
		else
		{
			//	Unquoted value
			for (; !xi.eof() && IsUrlresv(*xi) ; xi++)
				Z.AddByte(*xi) ;
		}
		attrValue = Z ;

		//	Any attribute other than a link/anchor href is simply recorded
		if (!((pNewnode->Type() == HTAG_LINK || pNewnode->Type() == HTAG_ANCHOR) && attrName.Equiv("href")))
		{
			attr.name = attrName ;
			attr.value = attrValue ;
			m_NodeAttrs.Insert(pNewnode->GetUid(), attr) ;
			continue ;
		}

		//	The tag is a link/anchor and attr is named 'href' so add link to the list of links found in the page
		if (!attrValue)
		{
			S = theTag ;
			threadLog("Line %d null link in tag %s\n", nLine, *S) ;
			continue ;
		}

		//	In-page references are not links
		if (attrValue[0] == CHAR_HASH)
			continue ;

		//	Is the link a mailto ? strncmp stops at the null terminator so a value shorter than the prefix is never over-read.
		if (strncmp(*attrValue, "mailto:", 7) == 0)
		{
			S = *attrValue + 7 ;
			m_Emails.Insert(S) ;
			continue ;
		}

		//	Resolve the link against the page base or the page domain where available
		if (m_Base && attrValue[0] == CHAR_FWSLASH)
		{
			link_url.SetValue(m_Base, attrValue) ;
			if (!link_url)
				threadLog("not a link case 1: %s\n", *attrValue) ;
		}
		else if (m_Info.Domain())
		{
			link_url.SetValue(m_Info.Domain(), attrValue) ;
			if (!link_url)
				threadLog("not a link case 2: %s\n", *attrValue) ;
		}
		else
		{
			link_url = attrValue ;
			if (!link_url)
				threadLog("not a link case 3: %s\n", *attrValue) ;
		}

		if (!link_url.Domain())
			threadLog("not a link case 4: %s\n", *link_url) ;

		//	Only a href that produced a valid URL is recorded as an attribute and as a page link
		if (link_url)
		{
			S = *link_url ;

			attr.name = attrName ;
			attr.value = attrValue ;
			m_NodeAttrs.Insert(pNewnode->GetUid(), attr) ;

			if (!m_setLinks.Exists(link_url))
			{
				m_setLinks.Insert(link_url) ;
				m_vecLinks.Add(link_url) ;
			}
		}
	}

	if (xi.eof())
	{
		threadLog("Line %d A. non-closure disqualifies tag\n", nLine) ;
		return 0 ;
	}

	if (*xi != CHAR_MORE)
	{
		S = theTag ;
		threadLog("Line %d C. malformed tag <%s> pnam=%s, attrValue=%s [%c]\n", nLine, *S, *attrName, *attrValue, *xi) ;
		return 0 ;
	}

	//	Step over the '>' and any whitespace that follows
	for (xi++ ; !xi.eof() && IsWhite(*xi) ; xi++) ;
	end = xi ;

	//	Check for correct parentage
	if (pParent)
	{
		ptype = pParent->Type() ;

		if (type == HTAG_TBL_CEL && ptype == HTAG_TBL_CEL)
		{
			//	This is where the author has forgotten to close a <td> and is now adding the next <td> in the row. We seek back to the <tr> (the
			//	true parent).
			threadLog("WARNING: Missing </td> anti-tag\n") ;

			pX = pParent->Parent() ;
			if (pX)
			{
				ptype = pX->Type() ;

				//	NOTE(review): the original test here (ptype != HTAG_TH || ptype != HTAG_TR) was always true, so the assignment is made
				//	unconditionally. It also has no effect on the new element, which was initialised with the original parent above.
				pParent = pX ;
			}
		}

		if (bError)
			threadLog("WARNING: New <%s> tag has parent of <%s>\n", *Tagtype2Txt(type), *Tagtype2Txt(ptype)) ;
	}

	ci = end ;
	return pNewnode ;
}
hzEcode hzDocHtml::_htmPreproc (hzChain& Z) { // Remove comments and non applicable conditional comments from HTML // // Arguments: 1) Reference to chain to be pre-processed // // Returns: E_FORMAT If the HTML is malformed // E_OK If the HTML was successfully processed
_hzfunc("hzDocHtml::_htmPreproc") ;
chIter zi ; // Iterator of input hzChain X ; // Target chain hzChain word ; // Diagnostics chain bool bIn ; // In a conditional comment
if (Z.Size() == 0) return E_OK ;
for (zi = Z ; !zi.eof() ;) { if (*zi != CHAR_LESS) { X.AddByte(*zi) ; zi++ ; continue ; }
if (zi == "<!-->") { zi += 5 ; continue ; }
// Ignore deleted text within comment (<!-- and -->) tags. Note these cannot be nested bIn = false ;
if (zi == "<!--[if") { bIn = true ; zi += 7 ; } if (zi == "<![if") { bIn = true ; zi += 5 ; }
if (bIn) { for (; !zi.eof() && *zi <= CHAR_SPACE ;) zi++ ;
if (zi == "!IE") { // Specific non-IE comment. Content herein must be allowed through.
for (zi += 2 ; !zi.eof() && *zi != CHAR_MORE ; zi++) ; if (zi.eof()) { threadLog("Unterminated conditional comment (line %d)\n", zi.Line()) ; return E_FORMAT ; }
zi++ ; if (zi == "-->") zi += 3 ;
for (; !zi.eof() ; zi++) { if (*zi == CHAR_LESS) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } if (zi == "<![endif]>-->") { zi += 13 ; break ; } if (zi == "<!--<![endif]-->") { zi += 16 ; break ; } }
word.AddByte(*zi) ; X.AddByte(*zi) ; }
//threadLog("word is %s\n", *word //m_Error << "\nword is: " << word ; //m_Error.AddByte(CHAR_NL) ; word.Clear() ; continue ; }
if (zi == "!(") zi += 2 ;
if (zi == "lte IE" || zi == "lt IE" || zi == "gte IE" || zi == "gt IE" || zi == "IE") { // We are not and never will be IE so ignore conditional comment // threadLog("Stripping IE cond comment line %d - ", zi.Line()) ;
for (zi += 2 ; !zi.eof() ; zi++) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } } continue ; }
// Include non IE stuff // threadLog("Stripping non-IE cond comment line %d - ", zi.Line()) ;
for (zi += 2 ; !zi.eof() && *zi != CHAR_MORE ; zi++) ; if (zi.eof()) { threadLog("Unterminated conditional comment (line %d)\n", zi.Line()) ; return E_FORMAT ; }
zi++ ; if (zi == "<!-->") zi += 5 ; if (zi == "-->") zi += 3 ;
for (; !zi.eof() ; zi++) { if (*zi == CHAR_LESS) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } if (zi == "<![endif]>-->") { zi += 13 ; break ; } if (zi == "<!--<![endif]-->") { zi += 16 ; break ; } }
word.AddByte(*zi) ; X.AddByte(*zi) ; }
//m_Error << "\nword is: " << word ; //m_Error.AddByte(CHAR_NL) ; word.Clear() ; continue ; }
if (zi == "<!--") { for (zi += 4 ; !zi.eof() ; zi++) { if (zi == "-->") { zi += 3 ; break ; } }
if (zi.eof()) { threadLog("_htmPreproc. Unterminated normal comment starting on line %d\n", zi.Line()) ; return E_FORMAT ; } continue ; }
X.AddByte(*zi) ; zi++ ; }
if (X.Size() == Z.Size()) return E_OK ;
Z.Clear() ; Z = X ; return E_OK ; }
hzEcode hzDocHtml::Load (hzChain& Z) { // Populate the hzDocHtml object with HTML source code in the supplied chain. // // Two scenarios are permitted - Full or Partial as follows:- // 1) Full: If the HTML source has the <html> as its first tag it will be considered as a full page and tested as such. // It will be expected to have the standard sub-tags of <head> and <body> and thier corresponding anti-tags. // If either of these are missing or in error (malformed or containing unxpected or malformed tags) the HTML // source code is deemed to be syntactically in error and the load fails. // // 2) Partial: If the opening tag of the HTML source code is not the <html> tag it is viable only if it would be viable as // a HTML fragment that could be seemlessly inserted into the <body> part of a whole HTML page. This is to say // that all it's tags must be legal sub-tags of <body> and not of <head> and nor must the <body> or <head> tag // or anti-tag be present. // // In either case, tags are loaded into a tree of nodes (tags). The nodes/tags may be searched for and examined. // // Arguments: 1) Z The chain containing the HTML document // // Returns: E_FORMAT If the HTML was rejected by the the HTML pre-processor _htmlPreproc() OR if any tags could not be processed by _proctag() // E_OK If the HTML was loaded successfully // // Note: Unlike XML where tags are named so that content in the tree can be searched directly, the nodes in HTML are not named // named and so cannot be definitely referenced (they only have type). Some other process must apply application specific criteria // to read meaning into the data.
_hzfunc("hzDocHtml::Load") ;
hzChain nc ; // Node content hzChain T ; // For token building hzChain W ; // For token building chIter zi ; // Chain iterator chIter tw_start ; // Start of tagword marker chIter tmp ; // Start of tagword marker chIter limit ; // End of tag marker - Protection against malformed tags (NLA style) hzHtmElem* pCN = 0 ; // Current HTML node hzHtmElem* pNN ; // New HTML node hzHtmElem* pX ; // HTML node for diagnostics hzHtmElem* pCurForm = 0 ; // HTML node for diagnostics hzAttrset ai ; // Attribute iterator hzHtmForm* pForm = 0 ; // Form found in page hzPair P ; // Name value pair (for forms and fields) hzString strval ; // To test if current tag is being closed hzString tagword ; // From MakeTag - just the tagname. hzString wholetag ; // From MakeTag - the entire opening sequence if applicable hzString anam ; // Attribute name hzString aval ; // Attribute value hzHtagform tf ; // Tag form uint32_t nX ; // For nesting levels/general iteration uint32_t nColon ; // Does the tagname contain a colon (3rd party tag) uint32_t nLine ; // Line number for errors uint32_t quote ; // Are we in a quoted string bool bAnti ; // Tag is an anti-tag int32_t cDelim ; // Delimiting char (single/double quote) hzEcode rc = E_OK ; // return code
Clear() ; //m_Error.Clear() ;
// Pre-process the HTML rc = _htmPreproc(Z) ; if (rc != E_OK) return rc ;
m_Content = Z ;
// Make sure the HTML tags are loading into the lookup table if (!s_htagNam.Count()) InitHtml() ;
// Init the iterator zi = Z ; zi.Skipwhite() ;
// Bypass the doctype if present if (zi.Equiv("<!DOCTYPE")) { quote = 0 ; for (zi += 9 ; !zi.eof() ; zi++) { if (quote) { if (*zi == CHAR_DQUOTE) quote = 0 ; continue ; }
if (*zi == CHAR_MORE) { zi++ ; break ; }
if (*zi == CHAR_DQUOTE) quote = 1 ; }
zi.Skipwhite() ; }
// Look for the opening <html> for (; !zi.eof() ; zi++) { if (zi.Equiv("<html")) { m_pRoot = _proctag(0, zi, HTAG_HTML) ; if (!m_pRoot) { threadLog("Could not establist root node (the <html> tag)\n") ; return E_FORMAT ; } break ; } }
if (!m_pRoot) { threadLog("No valid contents found before expected <html> tag - assuming a partial page\n") ; zi = Z ; zi.Skipwhite() ; pCN = new hzHtmElem() ; pCN->Init(this, 0, tagword, HTAG_NULL, m_vecTags.Count(), zi.Line()) ; m_vecTags.Add(pCN) ; } else { // A <html> tag has been found so this is a full page. Look for <head> next for (; !zi.eof() ;) { if (zi.Equiv("<head")) { m_pHead = _proctag(m_pRoot, zi, HTAG_HEAD) ; if (!m_pHead) { threadLog("Could not process <head> tag\n") ; return E_FORMAT ; } break ; } zi++ ; }
if (!m_pHead) { threadLog("Expected a <head> tag\n") ; return E_FORMAT ; } pCN = m_pHead ;
// Now get the subtags of <head> for (; rc == E_OK && pCN && !zi.eof() ;) { // Handle tag content if (*zi != CHAR_LESS) { // Ignore certain constructs if (zi == "//") { for (zi += 2 ; !zi.eof() && *zi != CHAR_NL ; zi++) ; continue ; }
// If not part of a construct, just agregate the char to the current tag's content, striping leading whitespace if (*zi <= CHAR_SPACE && pCN->m_tmpContent.Size() == 0) { zi++ ; continue ; } pCN->m_tmpContent.AddByte(*zi) ; zi++ ; continue ; }
// Ignore deleted text within comment (<!-- and -->) tags. Note these cannot be nested nLine = zi.Line() ;
if (zi == "<!--[if") { for (zi += 7 ; !zi.eof() ; zi++) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } } continue ; }
if (zi == "<![if") { for (zi += 5 ; !zi.eof() ; zi++) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } } continue ; }
if (zi == "<!--") { for (zi += 4 ; !zi.eof() ; zi++) { if (zi == "-->") { zi += 3 ; break ; } } continue ; }
// Handle <![CDATA[...]]> block by converting the innards to straight data (apparently CDATA now legal in HTML) if (zi == "<![CDATA[") { for (zi += 9 ; !zi.eof() ; zi++) { if (zi == "]]>") { zi += 3 ; break ; } pCN->m_tmpContent.AddByte(*zi) ; } continue ; }
// Eliminate <noscript> tags from header (we don't use them) if (zi == "<noscript") { for (zi += 9 ; !zi.eof() ; zi++) { if (zi == "</noscript>") { zi += 11 ; break ; } } if (zi.eof()) { threadLog("Unclosed <noscript> block\n") ; rc = E_FORMAT ; break ; } continue ; }
// At this point we have the '<' start of tag char. Establish whole and tagword of possible HTML tag
wholetag.Clear() ; tagword.Clear() ; limit = zi ; limit++ ; W.AddByte(CHAR_LESS) ; bAnti = false ; if (*limit == CHAR_FWSLASH) { W.AddByte(CHAR_FWSLASH) ; bAnti = true ; limit++ ; }
nColon = 0 ; for (tw_start = limit ; !limit.eof() ; limit++) { if (*limit == CHAR_COLON || IsAlphanum(*limit)) { if (*limit == CHAR_COLON) nColon++ ;
T.AddByte(*limit) ; W.AddByte(*limit) ; continue ; } break ; } tagword = T ; T.Clear() ; for (; !limit.eof() ;) { W.AddByte(*limit) ;
if (*limit == CHAR_DQUOTE || *limit == CHAR_SQUOTE) { cDelim = *limit ;
for (limit++ ; !limit.eof() ; limit++) { if (*limit == CHAR_BKSLASH) { limit++ ; if (*limit == cDelim) continue ; } if (*limit == cDelim) break ; } }
if (*limit == CHAR_MORE) break ; limit++ ; }
wholetag = W ; W.Clear() ;
if (*limit != CHAR_MORE) { threadLog("Malformed tag (%s)\n", *wholetag) ; zi = limit ; continue ; } limit++ ;
//tagword.ToLower() ;
if (nColon) { if (!s_htagNam.Exists(tagword)) { tf.klas = HTCLASS_3RD ; tf.rule = HTRULE_OPTION ; tf.name = tagword ; s_htagTyp.Insert(tf.type, tf) ; s_htagNam.Insert(tf.name, tf) ; threadLog("Inserted 3rd party HTML tag %s\n", *tagword) ; } }
if (!s_htagNam.Exists(tagword)) { if (bAnti) threadLog("Line %d case 1 Unknown lookup anti-tag </%s> (%s)\n", zi.Line(), *tagword, *wholetag) ; else threadLog("Line %d Case 1 Unknown lookup tag <%s> (%d bytes)\n", zi.Line(), *tagword, wholetag.Length()) ;
pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
tf = s_htagNam[tagword] ;
if (tf.type == HTAG_NULL) { if (bAnti) threadLog("Line %d case 2 Unknown lookup anti-tag </%s> (%s)\n", zi.Line(), *tagword, *wholetag) ; else threadLog("Line %d Case 2 Unknown lookup tag <%s> (%d bytes)\n", zi.Line(), *tagword, wholetag.Length()) ;
pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
// Obtain tag name if (bAnti == false) { if (zi.Equiv("<title>")) { pCN = _proctag(m_pHead, zi, HTAG_TITLE) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <meta> tags\n", zi.Line()) ; } } else if (zi.Equiv("<meta")) { pCN = _proctag(m_pHead, zi, HTAG_META) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <meta> tags\n", zi.Line()) ; } } else if (zi.Equiv("<style")) { pCN = _proctag(m_pHead, zi, HTAG_STYLE) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <style> tags\n", zi.Line()) ; } } else if (zi.Equiv("<script")) { pCN = _proctag(m_pHead, zi, HTAG_SCRIPT) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <script> tags\n", zi.Line()) ; } } else if (zi.Equiv("<link")) { pCN = _proctag(m_pHead, zi, HTAG_LINK) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <link> tags\n", zi.Line()) ; } } else if (zi.Equiv("<base")) { pCN = _proctag(m_pHead, zi, HTAG_BASE) ; if (!pCN) { rc = E_FORMAT ; threadLog("Line %d Could not process <link> tags\n", zi.Line()) ; } // Set m_Base if (pCN->m_tmpContent.Size()) m_Base = pCN->m_tmpContent ; else { // set the m_Base to the first param ai = pCN ; if (ai.Value()) m_Base = ai.Value() ; // pAttr = pCN->GetFirstAttr() ; // if (pAttr) // m_Base = pAttr->value ; } } else { rc = E_FORMAT ; threadLog("Line %d Could not process <%s> tag within <head>\n", zi.Line(), *tagword) ; }
continue ; }
// Handle antitag if (bAnti) { if (zi.Equiv("</head>")) { zi += 7 ; break ; }
// Inactive (text rendering only) anti-tags if (tf.klas == HTCLASS_TXT) { pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
// { zi = limit ; continue ; }
zi = limit ;
if (pCN->Type() == tf.type || tf.rule == HTRULE_SINGLE) pCN = pCN->Parent() ; else { threadLog("case 1 Tag mis-match. Current highest tag is <%s id=%d, level=%d> but on line %d we have an anti-tag for %s\n", *Tagtype2Txt(pCN->Type()), pCN->GetUid(), pCN->Level(), zi.Line(), *Tagtype2Txt(tf.type)) ;
if (tf.rule == HTRULE_SINGLE) { //pCN = pX ; pCN = pCN->Parent() ; threadLog("Case 2 Corrected by allowing last tag as anti-tag\n") ; }
if (pCN->Type() == HTAG_TBL_CEL && tf.type == HTAG_TR) { for (pX = pCN ; pX ; pX = pX->Parent()) { if (pX->Type() == tf.type) { pCN = pX ; threadLog("Corrected by decending to level %d\n", pCN->Level()) ; break ; } } } } continue ; }
// If none of the above just advance zi++ ; }
// Advance to the <body> tag for (; !zi.eof() ;) { if (zi.Equiv("<body")) { m_pBody = _proctag(m_pRoot, zi, HTAG_BODY) ; if (!m_pBody) { threadLog("Expected an actual body\n") ; return E_FORMAT ; } break ; } zi++ ; }
if (!m_pBody) { threadLog("Expected a <body> tag\n") ; return E_FORMAT ; } pCN = m_pBody ; }
// // Process document body. Here everything is either a tag, an anti-tag or it is tag-content. Both tags and antitags begin with a '<' so the // raw HTML is iterated and whenever the < is found, it is tested for a known tag/antitag. In the general case of "<tag>content</tag>", the // process is to call _procTag() to parse the tag, garner the attributes and to create a new element (which the current element is then set // to). Bytes after the tag are agregated to the current element's content until the antitag occurs (at which point the current element is // then set back to the parent tag). // // The exceptions to the general case:- // // 1) Paragraph tags can be left open (antitag omited). These tags are closed by the parent antitag or by another paragraph tag. // // 2) Print control tags which are completely ignored. These can never become the current tag so any content they have is aggregated to // their parent tag. // // 3) Links which do become current, but will have thier content aggregated to the parent tag. //
for (; pCN && !zi.eof() ;) { // Handle tag content if (*zi != CHAR_LESS) { if (pCN->Type() != HTAG_ANCHOR) { if (*zi <= CHAR_SPACE && pCN->m_tmpContent.Size() == 0) { zi++ ; continue ; } pCN->m_tmpContent.AddByte(*zi) ; } else { if (pCN->Parent()) pCN->Parent()->m_tmpContent.AddByte(*zi) ; }
zi++ ; continue ; }
// Ignore deleted text within <strike></strike> tags nLine = zi.Line() ;
if (zi == "<strike>") { for (zi += 8 ; !zi.eof() ; zi++) { if (zi == "</strike>") { zi += 9 ; break ; } } if (zi.eof()) { threadLog("Unclosed comment block\n") ; rc = E_FORMAT ; break ; } continue ; }
if (zi == "<fb:like>") { for (zi += 9 ; !zi.eof() ; zi++) { if (zi == "</fb:like>") { zi += 10 ; break ; } } if (zi.eof()) { threadLog("Facebook special\n") ; rc = E_FORMAT ; break ; } continue ; }
if (zi == "<g:plusone>") { for (zi += 11 ; !zi.eof() ; zi++) { if (zi == "</g:plusone>") { zi += 12 ; break ; } } if (zi.eof()) { threadLog("Google special\n") ; rc = E_FORMAT ; break ; } continue ; }
// Ignore deleted text within comment (<!-- and -->) tags if (zi == "<!--[if") { for (zi += 7 ; !zi.eof() ; zi++) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } } if (zi.eof()) { threadLog("Unterminated <!--[if cond]..> tag starting line %d\n", nLine) ; rc = E_FORMAT ; break ; } continue ; }
if (zi == "<![if") { for (zi += 5 ; !zi.eof() ; zi++) { if (zi == "<![endif]>") { zi += 10 ; break ; } if (zi == "<![endif]-->") { zi += 12 ; break ; } } if (zi.eof()) { threadLog("Unterminated <![if cond]..> tag starting line %d\n", nLine) ; rc = E_FORMAT ; break ; } continue ; }
if (zi == "<!--") { for (zi += 4 ; !zi.eof() ; zi++) { if (zi == "-->") { zi += 3 ; break ; } } if (zi.eof()) { threadLog("Unterminated <!--> tag starting line %d\n", nLine) ; rc = E_FORMAT ; break ; } continue ; }
/* ** At this point we have the '<' start of tag char. Establish whole and tagword of possible HTML tag */
wholetag.Clear() ; tagword.Clear() ; limit = zi ; limit++ ; W.AddByte(CHAR_LESS) ; bAnti = false ; if (*limit == CHAR_FWSLASH) { W.AddByte(CHAR_FWSLASH) ; bAnti = true ; limit++ ; }
nColon = 0 ; for (tw_start = limit ; !limit.eof() ; limit++) { if (*limit == CHAR_COLON || IsAlphanum(*limit)) { if (*limit == CHAR_COLON) nColon++ ;
T.AddByte(*limit) ; W.AddByte(*limit) ; continue ; } break ; } tagword = T ; T.Clear() ; for (; !limit.eof() ;) { W.AddByte(*limit) ;
if (*limit == CHAR_DQUOTE || *limit == CHAR_SQUOTE) { cDelim = *limit ;
for (limit++ ; !limit.eof() ; limit++) { if (*limit == CHAR_BKSLASH) { limit++ ; if (*limit == cDelim) continue ; } if (*limit == cDelim) break ; } }
if (*limit == CHAR_MORE) break ; limit++ ; }
wholetag = W ; W.Clear() ;
if (*limit != CHAR_MORE) { threadLog("Malformed tag (%s)\n", *wholetag) ; zi = limit ; continue ; }
tagword.ToLower() ;
if (nColon) { if (!s_htagNam.Exists(tagword)) { tf.klas=HTCLASS_3RD ; tf.rule=HTRULE_OPTION ; tf.name = tagword ; s_htagTyp.Insert(tf.type, tf) ; s_htagNam.Insert(tf.name, tf) ; threadLog("Inserted 3rd party HTML tag %s\n", *tagword) ; } }
// if (bAnti) // threadLog("Case 2 line %d Doing antitag %s\n", zi.Line(), *tagword) ; // else // threadLog("Case 2 line %d Doing tag %s\n", zi.Line(), *tagword) ;
tf = s_htagNam[tagword] ;
if (tf.type == HTAG_NULL) { // Unrecognized tags are just made part of the content of the currently applicable tag
if (bAnti) threadLog("Line %d Unknown lookup anti-tag </%s> (%s)\n", zi.Line(), *tagword, *wholetag) ; else threadLog("Line %d Case 3 Unknown lookup tag <%s> (%d bytes)\n", zi.Line(), *tagword, wholetag.Length()) ;
pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
if (bAnti == false) { // Ignore graphic tags if (tf.klas == HTCLASS_IMG) { zi = limit ; continue ; }
// Ignore self-closed 'system' tags if (tf.klas == HTCLASS_SYS) { if (tf.type == HTAG_EMBED) pCN->m_tmpContent << "<embed/>" ;
if (tf.type == HTAG_NOEMBED) pCN->m_tmpContent << "<noembed/>" ;
for (; !zi.eof() ; zi++) { if (*zi == CHAR_MORE) { zi++ ; break ; } }
threadLog("Line %d Bypassed system tag <%s> (%s)\n", zi.Line(), *tagword, *wholetag) ; zi = limit ; continue ; }
// Handle HTCLASS_TXT 'in-content' tags. We just copy these through, complete with tag, antitag and content, to the content of the // current tag. However these tags should still be placed in the m_mapTags and m_vecTags member.
if (tf.klas == HTCLASS_TXT) // || tf.type == HTAG_ANCHOR) { pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
// If we are suppressing anchors, we only want the content of a <a href=...>...</a> sequence. //if (m_bOpflags & HDOC_SUPPRESS_LINKS && tf.klas == HTCLASS_LNK && tf.type == HTAG_ANCHOR) //if (bFlags & HDOC_ONLOAD_LINKS && tf.klas == HTCLASS_LNK && tf.type == HTAG_ANCHOR) // { zi = limit ; continue ; }
// Eliminate scripts (may revisit) if (zi.Equiv("<script")) { // plog->Out("%s. ignoring a script tag ...\n", __FUNCTION__) ;
for (tmp = zi ; !tmp.eof() ; tmp++) { if (tmp.Equiv("</script>")) { tmp += 9 ; zi = tmp ; break ; } } if (zi.eof()) { threadLog("Unclosed script tag\n") ; rc = E_FORMAT ; break ; } continue ; }
/* ** Process 'data structure' tags into nodes. These are tables (with there rows and columns) but also menus ** and ordered and unordered lists. */
pNN = 0 ; pNN = _proctag(pCN, zi, tf.type) ;
if (!pNN) { threadLog("No node allocated for tag <%s>\n", *Tagtype2Txt(tf.type)) ; return E_FORMAT ; }
pCN = pNN ;
zi = limit ;
/* ** Handle the <input> tag. As this is it's own anti-tag it has no content, only parameters. We need to include the tag ** in the tree as it is active, but we need to effect the anti-tag aspect as well (so the level is not raised) */
if (tf.type == HTAG_INPUT) pCN = pCN->Parent() ;
continue ; }
// Handle anti-tags if (bAnti) { // Inactive (text rendering only) anti-tags if (tf.klas == HTCLASS_TXT) // || tf.type == HTAG_ANCHOR) { pCN->m_tmpContent << wholetag ; zi = limit ; continue ; }
// Ignore self-closed 'system' tags if (tf.klas == HTCLASS_SYS) { if (tf.type == HTAG_EMBED) pCN->m_tmpContent << "</embed>" ;
if (tf.type == HTAG_NOEMBED) pCN->m_tmpContent << "</noembed>" ;
for (; !zi.eof() ; zi++) { if (*zi == CHAR_MORE) { zi++ ; break ; } }
threadLog("Line %d Bypassed system anti-tag <%s> (%s)\n", zi.Line(), *tagword, *wholetag) ; zi = limit ; continue ; }
zi = limit ;
if (pCN->Type() == tf.type || tf.rule == HTRULE_SINGLE) pCN = pCN->Parent() ; else { threadLog("case 2 Tag mis-match. Current highest tag is <%s id=%d, level=%d> but on line %d we have an anti-tag for %s\n", *Tagtype2Txt(pCN->Type()), pCN->GetUid(), pCN->Level(), zi.Line(), *Tagtype2Txt(tf.type)) ;
if (tf.rule == HTRULE_SINGLE) { //pCN = pX ; pCN = pCN->Parent() ; threadLog("Case 1 Corrected by allowing last tag as anti-tag\n") ; }
if (pCN->Type() == HTAG_TBL_CEL && tf.type == HTAG_TR) { for (pX = pCN ; pX ; pX = pX->Parent()) { if (pX->Type() == tf.type) { pCN = pX ; threadLog("Corrected by decending to level %d\n", pCN->Level()) ; break ; } } } } continue ; }
threadLog("HANDLING ABD %s (%s)\n", *tagword, *wholetag) ; }
if (pCN) threadLog("End of file encountered whilst inside tag definition\n") ;
// Move thru the tags in thier order of appearence and reduce where appropriate, the tag content held in chains to strings. Place forms in // the list of forms and place form field tags with thier host forms.
for (nX = 0 ; nX < m_vecTags.Count() ; nX++) { pX = m_vecTags[nX] ;
if (pX->Type() == HTAG_FORM) { // Add the form to to m_Forms and set this to the current form pCurForm = pX ; pForm = new hzHtmForm() ; m_Forms.Add(pForm) ; continue ; }
if (pCurForm) { if (pX->Type() == HTAG_INPUT) { // Add this field to the current form (report error if not in a current form) if (pX->Line() < pCurForm->Anti()) { P.name = pX->Name() ;
// for (pAttr = pX->GetFirstAttr() ; pAttr ; pAttr = pAttr->next) // { // if (pAttr->name == "value") // { P.value = pAttr->value ; break ; } // } for (ai = pX ; ai.Valid() ; ai.Advance()) { anam = ai.Name() ; aval = ai.Value() ;
if (anam == "value") { P.value = aval ; break ; } }
pForm->fields.Add(P) ; } continue ; }
if (pX->Line() > pCurForm->Anti()) pCurForm = 0 ; } }
threadLog("END OF LOAD page has %d links\n", m_vecLinks.Count()) ;
return rc ; }
hzEcode	hzDocHtml::Load	(const char* fpath)
{
	//	Load a HTML document from file. The file content is read into a chain which is then passed to the chain variant of Load() to build the tree
	//	of HTML elements.
	//
	//	Arguments:	1)	fpath	Source file of HTML document
	//
	//	Returns:	E_FORMAT	If a format error caused the file load to fail
	//				E_OK		If the operation is successful
	//
	//	Any other error is that reported by OpenInputStrm(). Presumably these are E_ARGUMENT if no file path is supplied, E_NOTFOUND if the file does
	//	not exist, E_NODATA if the file is empty and E_OPENFAIL if the file cannot be read - TODO confirm against OpenInputStrm.

	_hzfunc("hzDocHtml::Load") ;

	ifstream	is ;		//	Input stream
	hzChain		Z ;			//	Chain for holding file content
	hzEcode		rc ;		//	Return code

	//	Open the file. Nothing further can be done if this fails
	rc = OpenInputStrm(is, fpath) ;
	if (rc != E_OK)
		return rc ;

	//	Load document into a working chain and process
	Z << is ;
	is.close() ;

	return Load(Z) ;
}
hzHtmElem*	hzHtmElem::GetFirstChild	(void) const
{
	//	Obtain the first child of this element. Elements refer to each other by position in the host document's node array. The position held in
	//	m_Children is 1-based, with zero meaning there are no children.
	//
	//	Arguments:	None
	//
	//	Returns:	Pointer to the first child element
	//				NULL if this element has no children
	//
	//	Note:	hzexit() is called with E_NOINIT if the element has no host document.

	_hzfunc("hzHtmElem::GetFirstChild") ;

	if (m_pHostDoc == 0)
		hzexit(E_NOINIT, "Element %s: Node has no host document", *m_Name) ;

	if (m_Children)
		return m_pHostDoc->m_arrNodes.InSitu(m_Children-1) ;

	return 0 ;
}
hzHtmElem*	hzHtmElem::Sibling	(void) const
{
	//	Obtain the next sibling of this element. Elements refer to each other by position in the host document's node array. The position held in
	//	m_Sibling is 1-based, with zero meaning there is no further sibling.
	//
	//	Arguments:	None
	//
	//	Returns:	Pointer to the sibling element
	//				NULL if this element has no sibling
	//
	//	Note:	hzexit() is called with E_NOINIT if the element has no host document.

	_hzfunc("hzHtmElem::Sibling") ;

	if (m_pHostDoc == 0)
		hzexit(E_NOINIT, "Element %s: Node has no host document", *m_Name) ;

	if (m_Sibling)
		return m_pHostDoc->m_arrNodes.InSitu(m_Sibling-1) ;

	return 0 ;
}
hzHtmElem*	hzHtmElem::Parent	(void) const
{
	//	Obtain the parent of this element. Elements refer to each other by position in the host document's node array. The position held in m_Parent
	//	is 1-based, with zero meaning there is no parent.
	//
	//	Arguments:	None
	//
	//	Returns:	Pointer to the parent element
	//				NULL if this element has no parent
	//
	//	Note:	hzexit() is called with E_NOINIT if the element has no host document.

	_hzfunc("hzHtmElem::Parent") ;

	if (m_pHostDoc == 0)
		hzexit(E_NOINIT, "Element %s: Node has no host document", *m_Name) ;

	if (m_Parent)
		return m_pHostDoc->m_arrNodes.InSitu(m_Parent-1) ;

	return 0 ;
}
hzDocHtml*	hzHtmElem::GetTree	(void)
{
	//	Return the HTML document whose tree of HTML elements this hzHtmElem is a part. We start at the current node and follow the parentage back to
	//	the node at level zero. The parent of that node, cast to a document pointer, is what is returned.
	//
	//	Arguments:	None
	//
	//	Returns:	Pointer to the parent of the level zero node, as a hzDocHtml pointer
	//
	//	Note:	Fatal() is called if this element has no parent, if the parentage ends before a level zero node is reached, or if the level zero
	//			node has no parent.
	//
	//	NOTE(review): Parent() returns a hzHtmElem pointer, so the cast to hzDocHtml* looks unsafe. The element also holds m_pHostDoc, which may be
	//	what callers actually need - confirm before relying on this function.

	hzHtmElem*	pN ;	//	Current tree node

	if (!m_Parent)
		Fatal("hzHtmElem::GetTree. 1. Tag %s (line %d, level %d) has no parent\n", *m_Name, m_nLine, m_nLevel) ;

	//	Go up the tree until the level zero node. Parent() returns NULL where a node has no parent, so the pointer is tested on each step.
	for (pN = this ; pN && pN->m_nLevel ; pN = pN->Parent()) ;

	if (!pN)
	{
		Fatal("hzHtmElem::GetTree. 3. Tag %s (line %d, level %d) has broken parentage\n", *m_Name, m_nLine, m_nLevel) ;
		return 0 ;
	}

	if (!pN->m_Parent)
		Fatal("hzHtmElem::GetTree. 2. Tag %s (line %d, level %d) has no parent\n", *pN->m_Name, pN->m_nLine, pN->m_nLevel) ;

	return (hzDocHtml*) pN->Parent() ;
}
uint32_t	hzHtmElem::_testnode	(hzVect<hzHtmElem*>& tmpResult, const char* srchExp, uint32_t& nLimit, uint32_t nLevel, bool bLog)
{
	//	Recursive support function to the non-recursive FindSubnodes function.
	//
	//	The leading part of the search expression is split into a node (tag) name, then optionally a child name introduced by a period, an attribute
	//	name introduced by "->" and a double-quoted attribute value introduced by '='. The test is applied to this node and, where required, to its
	//	children. Nodes deeper than nLimit (when nLimit is non-zero) are not examined; nLimit is set to the level of each node added to the results.
	//
	//	Arguments:	1)	tmpResult	Vector of HTML elements this function will add to
	//				2)	srchExp		HTML element selection criteria
	//				3)	nLimit		Depth limit for probing of child nodes (0 for no limit). Updated when a node is added.
	//				4)	nLevel		Depth level of this call (only passed on, incremented, to recursive calls)
	//				5)	bLog		Print log flag
	//
	//	Returns:	Number of elements added during this call on this element

	_hzfunc("hzHtmElem::_testnode") ;

	hzChain		Z ;					//	For extracting search expression components
	hzHtmElem*	pNode ;				//	Child node iterator
	const char*	i ;					//	Search expression iterator
	const char*	cpNext = 0 ;		//	Next part of search expression if present
	hzAttrset	ai ;				//	Attribute iterator
	hzString	cont ;				//	Element content as a temp string
	hzString	reqNode_name ;		//	Required name of node
	hzString	reqChild_name ;		//	Required name of node child
	hzString	reqNode_cont ;		//	Required content of node
	hzString	reqAttr_name ;		//	Required name of attribute
	hzString	reqAttr_value ;		//	Required value of attribute
	hzString	anam ;				//	Attribute name
	hzString	aval ;				//	Attribute value
	uint32_t	nTotal ;			//	Total nodes found matching search expression

	//	Nothing can match a missing expression
	if (!srchExp)
		return 0 ;

	//	If we are already at too high a level, return
	if (nLimit && (m_nLevel > nLimit))
	{
		if (bLog)
			threadLog("\t-> Out of range, returning 0\n") ;
		return 0 ;
	}

	//	Get required name of node
	for (i = srchExp ; IsAlpha(*i) ; i++)
		Z.AddByte(*i) ;
	reqNode_name = Z ;
	Z.Clear() ;

	//	Get required name of child, if a period follows. cpNext marks where the expression for the children starts.
	if (*i == CHAR_PERIOD)
	{
		i++ ;
		if (!IsAlpha(*i))
		{
			if (bLog)
				threadLog("Malformed criteria (%s)\n", srchExp) ;
			return 0 ;
		}

		cpNext = i ;
		for (; IsAlpha(*i) ; i++)
			Z.AddByte(*i) ;
		reqChild_name = Z ;
		Z.Clear() ;
	}

	//	Get name of attribute if applicable
	if (i[0] == CHAR_MINUS && i[1] == CHAR_MORE)
	{
		for (i += 2 ; IsUrlnorm(*i) ; i++)
			Z.AddByte(*i) ;
		reqAttr_name = Z ;
		Z.Clear() ;
	}

	//	An equal sign introduces the required attribute value, expected in double quotes. The scan stops at the terminator as well as at the
	//	closing quote so that a malformed expression cannot run the iterator off the end of the string.
	if (*i == CHAR_EQUAL)
	{
		i++ ;
		if (*i)
			i++ ;
		for (; *i && *i != CHAR_DQUOTE ; i++)
			Z.AddByte(*i) ;
		reqAttr_value = Z ;
		Z.Clear() ;
	}

	if (m_Name == reqNode_name)
	{
		//	We are on the specified node. If no child is named this is the last node to check, so any content or attribute requirement applies
		//	here and a failure ends the examination of this branch.
		if (!reqChild_name)
		{
			//	NOTE(review): nothing in this function assigns reqNode_cont, so this content test never fires as written.
			if (reqNode_cont)
			{
				cont = m_tmpContent ;
				if (reqNode_cont != cont)
					return 0 ;
			}

			if (reqAttr_name)
			{
				//	Look for an attribute of the required name (and value, if one was given) on this node
				for (ai = this ; ai.Valid() ; ai.Advance())
				{
					anam = ai.Name() ;
					aval = ai.Value() ;

					if (bLog)
						threadLog("Compare attr names (%s to param->name of %s)\n", *reqAttr_name, *anam) ;
					if (anam != reqAttr_name)
						continue ;

					if (bLog)
						threadLog("Found a attr name match ") ;
					if (reqAttr_value && reqAttr_value != aval)
					{
						if (bLog)
							threadLog("but not a pvalue match (%s not param->val of %s)\n", *reqAttr_value, *aval) ;
						continue ;
					}

					if (bLog)
						threadLog(" - bingo\n") ;
					break ;
				}

				//	The iterator is only still valid if the loop was left by the break above, i.e. a match was found
				if (!ai.Valid())
				{
					if (bLog)
						threadLog("Oops - run out of params\n") ;
					return 0 ;
				}
			}
		}

		//	The first part of the expression is satisfied. With no further expression this node is a result.
		if (!cpNext)
		{
			nLimit = m_nLevel ;
			tmpResult.Add(this) ;
			return 1 ;
		}

		//	Otherwise the remainder of the expression must be satisfied by the children
		nTotal = 0 ;
		for (pNode = GetFirstChild() ; pNode ; pNode = pNode->Sibling())
		{
			if (nLimit && (pNode->m_nLevel > nLimit))
				continue ;

			nTotal += pNode->_testnode(tmpResult, cpNext, nLimit, nLevel + 1, bLog) ;
		}
		return nTotal ;
	}

	//	This node does not have the required name, so try the children with the same expression.
	//	NOTE(review): only children whose own name matches are entered, so matches more than one level below a non-matching node are not
	//	reached from here - confirm this is intended.
	nTotal = 0 ;
	for (pNode = GetFirstChild() ; pNode ; pNode = pNode->Sibling())
	{
		if (nLimit && (pNode->m_nLevel > nLimit))
			continue ;

		if (pNode->Name() == reqNode_name)
			nTotal += pNode->_testnode(tmpResult, srchExp, nLimit, nLevel + 1, bLog) ;
	}

	return nTotal ;
}
void	hzHtmElem::FindSubnodes	(hzVect<hzHtmElem*>& result, const char* srchExp, bool bLog)
{
	//	Starting at this element, collect the elements that satisfy the supplied search expression. The work is done by the recursive
	//	_testnode() function, which is started here with no depth limit and at level 0.
	//
	//	The expression is parsed by _testnode(): a tag name, optionally followed by a period and a child tag name, optionally followed by "->"
	//	and an attribute name, optionally followed by '=' and a double-quoted value.
	//
	//	Arguments:	1)	result		The vector of elements found. It is emptied before the search starts.
	//				2)	srchExp		Search expression
	//				3)	bLog		Set if detailed logging is required
	//
	//	Returns:	None. The call is fatal if this element has no tree.

	uint32_t	depthCap = 0 ;		//	Level limit (0 is no limit), adjusted by _testnode as results are found

	//	An element without a tree cannot be searched
	if (!GetTree())
		Fatal("No tree - aborting\n") ;

	result.Clear() ;
	_testnode(result, srchExp, depthCap, 0, bLog) ;
}
uint32_t	hzDocHtml::ExtractLinksBasic	(hzVect<hzUrl>& links, const hzSet<hzString>& domains, const hzString& form)
{
	//	Filter the links already held in m_vecLinks into the supplied vector. A link is kept if it is non-empty, its domain is in the supplied
	//	set (when the set is not empty) and it passes FormCheckCstr() against the supplied form (when a form is given). With no domains and no
	//	form every non-empty link is kept. The page content is not read here.
	//
	//	Arguments:	1)	links	The vector of URLs (links) accepted. Emptied first.
	//				2)	domains	The set of domains that links must belong to in order to be included
	//				3)	form	The search criteria, if any
	//
	//	Returns:	Number of links that meet the supplied criteria

	hzUrl		candidate ;		//	Link under consideration
	uint32_t	pos ;			//	Position in m_vecLinks

	links.Clear() ;

	for (pos = 0 ; pos < m_vecLinks.Count() ; pos++)
	{
		candidate = m_vecLinks[pos] ;

		//	Empty links are skipped (there should not be any)
		if (!candidate)
			continue ;

		//	Domain filter, usually just the website's own domain
		if (domains.Count() && !domains.Exists(candidate.Domain()))
			continue ;

		//	Form filter
		if (form && !FormCheckCstr(*candidate, *form))
			continue ;

		links.Add(candidate) ;
	}

	return links.Count() ;
}
uint32_t	hzDocHtml::ExtractLinksContent	(hzMapS<hzUrl,hzString>& links, const hzSet<hzString>& domains, const hzString& criteria)
{
	//	Walk every anchor element in m_vecTags and, for each href attribute whose URL is non-empty, lies within the supplied domains (when any
	//	are given) and passes FormCheckCstr() against the criteria (when given), map the URL to the anchor's content.
	//
	//	Arguments:	1)	links		The map of URL to anchor content. Emptied first.
	//				2)	domains		The set of domains that links must belong to in order to be included
	//				3)	criteria	The search criteria, if any
	//
	//	Returns:	Number of entries in the map on completion

	hzHtmElem*	pTag ;			//	Current element
	hzAttrset	attr ;			//	Attribute iterator
	hzString	attrName ;		//	Attribute name
	hzString	text ;			//	Content of the anchor
	hzUrl		url ;			//	URL taken from the href
	uint32_t	pos ;			//	Position in m_vecTags

	links.Clear() ;

	for (pos = 0 ; pos < m_vecTags.Count() ; pos++)
	{
		pTag = m_vecTags[pos] ;

		if (pTag->Type() == HTAG_ANCHOR)
		{
			for (attr = pTag ; attr.Valid() ; attr.Advance())
			{
				attrName = attr.Name() ;
				if (!attrName.Equiv("href"))
					continue ;

				url = attr.Value() ;

				//	Empty links are skipped (there should not be any)
				if (!url)
					continue ;

				//	Domain filter, usually just the website's own domain
				if (domains.Count() && !domains.Exists(url.Domain()))
					continue ;

				//	Limiting criteria
				if (criteria && !FormCheckCstr(*url, *criteria))
					continue ;

				text = pTag->m_tmpContent ;
				links.Insert(url, text) ;
			}
		}
	}

	return links.Count() ;
}
hzEcode	hzDocHtml::Import	(const hzString& path)
{
	//	Read the named file into a chain and hand the chain to Load() to build the tree of HTML nodes.
	//
	//	Arguments:	1)	path	The full pathname of the file to load
	//
	//	Returns:	The code from OpenInputStrm() if the file could not be opened, otherwise the code from Load(). The values listed for this
	//				function are:
	//				E_ARGUMENT	If no file path is supplied
	//				E_NOTFOUND	If the file does not exist
	//				E_NODATA	If the file is empty
	//				E_OPENFAIL	If the file cannot be read
	//				E_FORMAT	If a format error caused the file load to fail
	//				E_OK		If the operation is successful

	_hzfunc("hzDocHtml::Import") ;

	ifstream	input ;		//	Input stream
	hzChain		content ;	//	File content
	hzEcode		rc ;		//	Return code

	rc = OpenInputStrm(input, path) ;
	if (rc != E_OK)
		return rc ;

	content << input ;
	input.close() ;

	rc = Load(content) ;
	return rc ;
}
void	hzDocHtml::_report	(hzLogger& xlog, hzHtmElem* node)
{
	//	Category:	Diagnostics
	//
	//	Recursive support function for the non-recursive hzDocHtml::Report. Writes the opening tag with its attributes, then reports every
	//	child, then writes the node's content (leading whitespace and CR/LF pairs removed) and finally the closing tag. Each line is prefixed
	//	by the node level and an indent of that many steps.
	//
	//	Arguments:	1)	xlog	The logfile to write report to
	//				2)	node	The starting node
	//
	//	Returns:	None

	hzHtmElem*	pChild ;	//	Child iterator
	hzChain		text ;		//	Node content after trimming
	chIter		ci ;		//	Content iterator
	hzAttrset	attr ;		//	Attribute iterator
	int			step ;		//	Indent counter

	if (!node)
	{
		xlog.Out("hzDocHtml::_report: ERROR No HTML element suppled\n") ;
		return ;
	}

	//	Opening tag with attributes
	xlog.Out("%2d: ", node->Level()) ;
	step = node->Level() ;
	while (step)
	{
		xlog << ". " ;
		step-- ;
	}

	xlog.Out("<%s", *Tagtype2Txt(node->Type())) ;
	for (attr = node ; attr.Valid() ; attr.Advance())
		xlog.Out(" %s=\"%s\"", attr.Name(), attr.Value()) ;
	xlog << ">\n" ;

	//	Children come before this node's own content
	for (pChild = node->GetFirstChild() ; pChild ; pChild = pChild->Sibling())
		_report(xlog, pChild) ;

	//	Content
	if (node->m_tmpContent.Size())
	{
		//	Skip leading whitespace and control chars
		ci = node->m_tmpContent ;
		while (!ci.eof() && *ci <= CHAR_SPACE)
			ci++ ;

		//	Copy the rest, dropping CR/LF pairs (the extra step here plus the loop step passes over both chars)
		for (; !ci.eof() ; ci++)
		{
			if (ci == "\r\n")
				ci++ ;
			else
				text.AddByte(*ci) ;
		}

		if (text.Size())
		{
			xlog.Out("%2d: ", node->Level()) ;
			step = node->Level() ;
			while (step)
			{
				xlog << " " ;
				step-- ;
			}

			xlog << "[" << text << "]\n" ;
		}
	}

	//	Closing tag
	xlog.Out("%2d: ", node->Level()) ;
	step = node->Level() ;
	while (step)
	{
		xlog << ". " ;
		step-- ;
	}
	xlog.Out("</%s>\n", *Tagtype2Txt(node->Type())) ;
}
void	hzDocHtml::Report	(hzLogger& xlog)
{
	//	Write a two part report on the document. First a flat list, one line per entry of m_vecTags, giving the element id, the ids of its
	//	parent, first child and next sibling (0 where there is none), its level, tag type and content. Then the tree as written by _report()
	//	starting from the root.
	//
	//	Arguments:	1)	xlog	The logfile to write report to
	//	Returns:	None

	_hzfunc("hzDocHtml::Report") ;

	hzHtmElem*	pTag ;		//	Current element
	hzHtmElem*	pPar ;		//	Its parent
	hzHtmElem*	pKid ;		//	Its first child
	hzHtmElem*	pSib ;		//	Its next sibling
	hzString	text ;		//	Element content
	uint32_t	pos ;		//	Position in m_vecTags

	//	Flat list
	if (m_vecTags.Count())
	{
		for (pos = 0 ; pos < m_vecTags.Count() ; pos++)
		{
			pTag = m_vecTags[pos] ;
			text = pTag->m_tmpContent ;

			pPar = pTag->Parent() ;
			pKid = pTag->GetFirstChild() ;
			pSib = pTag->Sibling() ;

			xlog.Out("id=%d par=%d subs=%d nxt=%d lev=%d: %s [%s]\n",
				pTag->GetUid(),
				pPar ? pPar->GetUid() : 0,
				pKid ? pKid->GetUid() : 0,
				pSib ? pSib->GetUid() : 0,
				pTag->Level(), *Tagtype2Txt(pTag->Type()), *text) ;
		}
	}
	else
		xlog.Out("PAGE is EMPTY - No nodes in Vector\n") ;

	//	Tree
	if (m_pRoot)
		_report(xlog, m_pRoot) ;
	else
		xlog.Out("PAGE is EMPTY - No subnodes of root\n") ;
}
hzEcode	hzDocHtml::_xport	(hzChain& Z, hzHtmElem* node)
{
	//	Recursive support function for hzDocHtml::Export. Appends to the supplied chain the opening tag of the node with its attributes, the
	//	node's content (leading whitespace and CR/LF pairs removed), the same for every child in turn, and then the closing tag. Each line is
	//	prefixed by the node level and an indent of that many steps. The result of the recursive calls is not examined.
	//
	//	Arguments:	1)	Z		The output chain
	//				2)	node	The current node
	//
	//	Returns:	E_ARGUMENT	If no HTML element is supplied
	//				E_OK		If the operation was successful

	hzChain		text ;		//	Node content after trimming
	chIter		ci ;		//	Content iterator
	hzHtmElem*	pChild ;	//	Child iterator
	hzAttrset	attr ;		//	Attribute iterator
	int			step ;		//	Indent counter

	if (!node)
		return E_ARGUMENT ;

	//	Opening tag with attributes
	Z.Printf("%2d: ", node->Level()) ;
	step = node->Level() ;
	while (step)
	{
		Z << ". " ;
		step-- ;
	}
	Z.Printf("<%s", *Tagtype2Txt(node->Type())) ;

	for (attr = node ; attr.Valid() ; attr.Advance())
		Z.Printf(" %s=\"%s\"", attr.Name(), attr.Value()) ;
	Z << ">\n" ;

	//	Content
	if (node->m_tmpContent.Size())
	{
		//	Skip leading whitespace and control chars
		ci = node->m_tmpContent ;
		while (!ci.eof() && *ci <= CHAR_SPACE)
			ci++ ;

		//	Copy the rest, dropping CR/LF pairs (the extra step here plus the loop step passes over both chars)
		for (; !ci.eof() ; ci++)
		{
			if (ci == "\r\n")
				ci++ ;
			else
				text.AddByte(*ci) ;
		}

		if (text.Size())
		{
			Z.Printf("%2d: ", node->Level()) ;
			step = node->Level() ;
			while (step)
			{
				Z << " " ;
				step-- ;
			}

			Z.AddByte('[') ;
			Z << text ;
			Z.AddByte(']') ;
			Z.AddByte(CHAR_NL) ;
		}
	}

	//	Children
	for (pChild = node->GetFirstChild() ; pChild ; pChild = pChild->Sibling())
		_xport(Z, pChild) ;

	//	Closing tag
	Z.Printf("%2d: ", node->Level()) ;
	step = node->Level() ;
	while (step)
	{
		Z << ". " ;
		step-- ;
	}
	Z.Printf("</%s>\n", *Tagtype2Txt(node->Type())) ;

	return E_OK ;
}
hzEcode	hzDocHtml::Export	(const hzString& filepath)
{
	//	Write the document to the named file. The requested and actual URLs are written first (when set), followed by the tree as produced by
	//	_xport() or, where there is no root element, the raw content held in m_Content.
	//
	//	Arguments:	1)	filepath	The file to export the HTML document to
	//
	//	Returns:	E_ARGUMENT	If no export file path is supplied
	//				E_NODATA	If there is neither a root element nor any raw content
	//				E_OPENFAIL	If the file could not be opened for writing
	//				E_WRITEFAIL	If the stream reports failure after a write
	//				E_OK		If the export ran to completion

	_hzfunc("hzDocHtml::Export") ;

	ofstream	os ;		//	Output stream
	hzChain		out ;		//	Working chain for output construction
	hzEcode		rc ;		//	Return code

	if (!filepath)
		return hzerr(E_ARGUMENT, "No pathname supplied") ;

	if (!m_pRoot && !m_Content.Size())
		return hzerr(E_NODATA, "Empty page (no root node). Nothing written to file %s\n", *filepath) ;

	os.clear() ;
	os.open(*filepath) ;
	if (os.fail())
		return hzerr(E_OPENFAIL, "Could not open file %s\n", *filepath) ;

	//	Header lines.
	//	NOTE(review): the requested URL is tested directly while the actual URL is tested through operator* - confirm the two tests are
	//	meant to differ.
	if (m_Info.m_urlReq)
		out.Printf("URL (req): %s\n", *m_Info.m_urlReq) ;
	if (*m_Info.m_urlAct)
		out.Printf("URL (act): %s\n", *m_Info.m_urlAct) ;

	os << out ;
	if (os.fail())
	{
		os.close() ;
		return E_WRITEFAIL ;
	}
	out.Clear() ;

	//	Body: the element tree if there is one, otherwise the raw content
	if (m_pRoot)
		rc = _xport(out, m_pRoot) ;
	else
	{
		rc = E_OK ;
		out = m_Content ;
	}

	os << out ;
	if (os.fail())
		rc = E_WRITEFAIL ;

	os.close() ;
	return rc ;
}
void hzDocHtml::Clear (void) { // Recursively clear the tree of nodes // // Arguments: None // Returns: None
hzHtmElem* pNode ; // Node pointer uint32_t nIndex ; // Document tags iterator
for (nIndex = 0 ; nIndex < m_vecTags.Count() ; nIndex++) { pNode = m_vecTags[nIndex] ; delete pNode ; }
m_vecTags.Clear() ; m_vecLinks.Clear() ; m_setLinks.Clear() ; m_Emails.Clear() ;
m_pRoot = 0 ; m_pHead = 0 ; m_pBody = 0 ; }
hzEcode	hzDocHtml::FindElements	(hzVect<hzHtmElem*>& elements, hzString& htag, hzString& attrName, hzString& attrValue)
{
	//	Find all elements in a page with the given tag name and/or attribute and value.
	//
	//	The range of m_mapTags entries examined is held in signed integers. A 'not found' result from the map lookup is negative and an empty
	//	map gives an upper bound of -1; in unsigned variables neither can be detected and the scan runs outside the map.
	//
	//	Arguments:	1)	elements	Elements found, in m_mapTags order, matching on tag name and on attribute name and value where supplied.
	//				2)	htag		The tag name. If empty, elements of every tag name are examined.
	//				3)	attrName	The attribute name. Optional, but if supplied the element must have an attribute of this name.
	//				4)	attrValue	The attribute value. Optional, but if supplied the element must have an attribute with this value (on the
	//								attribute named by attrName if that is also supplied, otherwise on any attribute).
	//
	//	Returns:	E_NOTFOUND	If a tag name is supplied and the document has no element of that name
	//				E_OK		Otherwise, whether or not any element was selected

	hzHtmElem*	pElement ;		//	HTML node
	hzAttrset	ai ;			//	Attribute iterator
	hzString	anam ;			//	Attribute name
	hzString	aval ;			//	Attribute value
	int32_t		Lo ;			//	First element in m_mapTags to investigate
	int32_t		Hi ;			//	Last element in m_mapTags to investigate
	int32_t		nIndex ;		//	Element iterator
	bool		bOk ;			//	OK to insert the element

	elements.Clear() ;

	if (htag)
	{
		//	A tagname has been supplied so limit the investigation to tags with the tagname
		Lo = m_mapTags.First(htag) ;
		if (Lo < 0)
			return E_NOTFOUND ;
		Hi = m_mapTags.Last(htag) ;
	}
	else
	{
		Lo = 0 ;
		Hi = static_cast<int32_t>(m_mapTags.Count()) - 1 ;
	}

	//	Investigate elements
	for (nIndex = Lo ; nIndex <= Hi ; nIndex++)
	{
		pElement = m_mapTags.GetObj(nIndex) ;

		if (!attrName && !attrValue)
		{
			//	No attribute criteria, so the tag name alone selects the element
			bOk = true ;
		}
		else
		{
			//	The element needs one attribute that meets every criterion supplied
			bOk = false ;
			for (ai = pElement ; !bOk && ai.Valid() ; ai.Advance())
			{
				anam = ai.Name() ;
				aval = ai.Value() ;

				if (attrName && anam != attrName)
					continue ;
				if (attrValue && aval != attrValue)
					continue ;
				bOk = true ;
			}
		}

		if (bOk)
			elements.Add(pElement) ;
	}

	return E_OK ;
}
hzEcode	hzDocHtml::FindElements	(hzVect<hzHtmElem*>& elements, const char* srchExp)
{
	//	Find all tags meeting the supplied criteria and place pointers to the tags in the supplied results vector.
	//
	//	The expression is a period-separated list of tag names giving the required ancestry (outermost first), optionally followed by "->" and
	//	an attribute name, optionally followed by '=' and a single-quoted attribute value. Elements are looked up in m_mapTags by the last tag
	//	name in the list and kept if each ancestor in turn carries the preceding names. If no tag name is given, every element in m_mapTags is
	//	a candidate. Candidates are then filtered on the attribute name and value, where supplied.
	//
	//	Index and ancestry counters that must be able to go negative are signed; as unsigned values the 'not found' and 'ancestry exhausted'
	//	tests could never be true.
	//
	//	Arguments:	1)	elements	The vector of elements found. Emptied first.
	//				2)	srchExp		Search expression
	//
	//	Returns:	E_FORMAT	If a period in the expression is not followed by a tag name, or nothing precedes the first period
	//				E_OK		Otherwise, whether or not any element was selected

	_hzfunc("hzDocHtml::FindElements") ;

	hzVect<hzString>	list ;		//	List of tagnames forming required node ancestry
	hzVect<hzHtmElem*>	found ;		//	Nodes matching on tag name and ancestry

	hzChain		Z ;					//	For extracting tagnames etc
	hzHtmElem*	pN ;				//	Element
	hzHtmElem*	pK ;				//	Element ancestor
	hzAttrset	ai ;				//	Attribute iterator
	const char*	i ;					//	For processing criteria
	hzString	tnam ;				//	Tagname
	hzString	reqAttr_name ;		//	Attribute name (if any)
	hzString	reqAttr_value ;		//	Attribute value (if any)
	int32_t		Lo ;				//	1st element to investigate
	int32_t		Hi ;				//	Last element to investigate
	int32_t		x ;					//	Element iterator
	int32_t		v ;					//	Ancestry iterator (counts down past zero)
	uint32_t	n ;					//	General iterator
	uint32_t	anc ;				//	Ancestry level

	elements.Clear() ;

	//	Get the leading tag name. It is only listed if present, so that an expression with attribute criteria alone is applied to all tags.
	for (i = srchExp ; IsAlphanum(*i) ; i++)
		Z.AddByte(*i) ;
	tnam = Z ;
	Z.Clear() ;

	if (tnam)
		list.Add(tnam) ;
	else
	{
		if (*i == CHAR_PERIOD)
			return hzerr(E_FORMAT, "Malformed criteria (%s)\n", srchExp) ;
	}

	//	Get further tag names
	for (; *i == CHAR_PERIOD ;)
	{
		i++ ;
		if (!IsAlpha(*i))
			return hzerr(E_FORMAT, "Malformed criteria (%s)\n", srchExp) ;

		for (; IsAlphanum(*i) ; i++)
			Z.AddByte(*i) ;
		tnam = Z ;
		Z.Clear() ;
		list.Add(tnam) ;
	}

	//	Get name of attribute if applicable
	if (i[0] == CHAR_MINUS && i[1] == CHAR_MORE)
	{
		for (i += 2 ; IsUrlnorm(*i) ; i++)
			Z.AddByte(*i) ;
		reqAttr_name = Z ;
		Z.Clear() ;
	}

	//	An equal sign introduces the required value, expected in single quotes. The scan also stops at the terminator so that a missing
	//	closing quote cannot run the iterator off the end of the string.
	if (*i == CHAR_EQUAL)
	{
		i++ ;
		if (*i)
			i++ ;
		for (; *i && *i != CHAR_SQUOTE ; i++)
			Z.AddByte(*i) ;
		reqAttr_value = Z ;
		Z.Clear() ;
	}

	anc = list.Count() ;
	if (anc)
	{
		//	Look up the last tag in the m_mapTags
		anc-- ;
		tnam = list[anc] ;

		Lo = m_mapTags.First(tnam) ;
		if (Lo < 0)
			return E_OK ;
		Hi = m_mapTags.Last(tnam) ;

		threadLog("node (%d - %d) %s a=%s v=%s", Lo, Hi, *tnam, *reqAttr_name, *reqAttr_value) ;

		for (x = Lo ; x <= Hi ; x++)
		{
			pN = m_mapTags.GetObj(x) ;

			if (!anc)
			{
				found.Add(pN) ;
				continue ;
			}

			//	Progress thru ancestry. v only reaches -1 if every listed ancestor name was matched.
			pK = pN->Parent() ;
			for (v = static_cast<int32_t>(anc) - 1 ; pK && v >= 0 ; pK = pK->Parent(), v--)
			{
				threadLog("<- %s ", *pK->Name()) ;
				if (pK->Name() != list[v])
					break ;
			}
			if (v < 0)
			{
				found.Add(pN) ;
				threadLog("OK ") ;
			}
		}
	}
	else
	{
		//	No tag name given so all nodes are candidates
		for (n = 0 ; n < m_mapTags.Count() ; n++)
		{
			pN = m_mapTags.GetObj(n) ;
			found.Add(pN) ;
		}
	}

	//	Check all found nodes for attribute criteria
	for (n = 0 ; n < found.Count() ; n++)
	{
		pN = found[n] ;

		if (!reqAttr_name && !reqAttr_value)
		{
			elements.Add(pN) ;
			continue ;
		}

		for (ai = pN ; ai.Valid() ; ai.Advance())
		{
			if (reqAttr_name && reqAttr_name != ai.Name())
				{ threadLog("-1 ") ; continue ; }
			if (reqAttr_value && reqAttr_value != ai.Value())
				{ threadLog("-2 ") ; continue ; }

			elements.Add(pN) ;
			threadLog("+ ") ;
			break ;
		}
	}

	threadLog("done\n") ;
	return E_OK ;
}
hzEcode	hzDocHtml::_selectTag	(hzSet<hzHtmElem*>& parents, hzSet<hzHtmElem*>& elements, const hzString& tagspec)
{
	//	Finds the set of tags meeting the supplied tag specifier.
	//
	//	The specifier has the form <tagname attr1='value1' attr2=* ...>. Elements are looked up in m_mapTags by tag name. If the set of parents
	//	is not empty an element is only considered if one of its ancestors is in that set. Where attribute requirements are given, the element
	//	is selected if the count of its attributes satisfying a requirement equals the number of requirements ('*' accepts any value).
	//
	//	The index range into m_mapTags is held in signed integers because the 'not found' result of the lookup is negative; held unsigned, the
	//	test for it could never be true and the scan would run outside the map.
	//
	//	Arguments:	1)	parents		Set of parent tags
	//				2)	elements	Set of selected tags. Emptied first.
	//				3)	tagspec		Tag selection criteria
	//
	//	Returns:	E_SYNTAX	If the tag is malformed or illegal
	//				E_OK		If the tag is correct, even if no instances are found

	_hzfunc("hzDocHtml::_selectTag") ;

	hzMapS<hzString,hzString>	pairs ;		//	List of attrs and attr values the tag must possess (if any)

	hzChain		word ;			//	Word extraction
	hzAttrset	ai ;			//	Attribute iterator
	hzHtmElem*	pE ;			//	HTML element (tag)
	hzHtmElem*	pAnc ;			//	Ancestor of the element
	const char*	i ;				//	For processing term
	hzString	tagname ;		//	Name of tag sought
	hzString	pnam ;			//	Name of attr sought
	hzString	pval ;			//	Value of attr sought
	hzString	anam ;			//	Attribute name
	hzString	aval ;			//	Attribute value
	uint32_t	nP ;			//	Name-value pair iterator
	int32_t		Lo ;			//	First incidence of tagname
	int32_t		Hi ;			//	Last incidence of tagname
	int32_t		nIndex ;		//	Tag iterator
	uint32_t	nFound ;		//	Attribute requirements met
	bool		bFound ;		//	Ancestry test
	hzEcode		rc = E_OK ;		//	Return code

	elements.Clear() ;

	//	Get tag name from the search criteria
	i = *tagspec ;

	if (!i || i[0] != CHAR_LESS)
		return hzerr(E_SYNTAX, "Term does not begin with an opening '<' char") ;

	for (i++ ; IsAlphanum(*i) ; i++)
		word.AddByte(*i) ;
	tagname = word ;
	word.Clear() ;

	if (!tagname)
		return hzerr(E_SYNTAX, "No tagname supplied") ;

	//	Get attribute requirements from the search criteria
	for (; *i == CHAR_SPACE ;)
	{
		for (i++ ; *i && *i <= CHAR_SPACE ; i++) ;
		pnam = pval = (char*) 0 ;

		for (; IsAlphanum(*i) ; i++)
			word.AddByte(*i) ;
		pnam = word ;
		word.Clear() ;

		if (!pnam)
			{ rc = E_SYNTAX ; threadLog("Attr name not supplied\n") ; break ; }
		if (*i != CHAR_EQUAL)
			{ rc = E_SYNTAX ; threadLog("Attr name not followed by an assignment operator\n") ; break ; }

		//	An asterisk in place of a quoted value accepts any value
		i++ ;
		if (*i == CHAR_ASTERISK)
		{
			i++ ;
			pval = "*" ;
			pairs.Insert(pnam, pval) ;
			continue ;
		}

		if (*i != CHAR_SQUOTE)
			{ rc = E_SYNTAX ; threadLog("Attr has no opening single quote\n") ; break ; }
		for (i++ ; *i && *i != CHAR_SQUOTE ; i++)
			word.AddByte(*i) ;
		if (*i != CHAR_SQUOTE)
			{ rc = E_SYNTAX ; threadLog("Attr has no closing single quote\n") ; break ; }
		i++ ;
		pval = word ;
		word.Clear() ;

		pairs.Insert(pnam, pval) ;
	}

	if (rc != E_OK)
		return rc ;

	if (*i != CHAR_MORE)
	{
		threadLog("Term does not end with a closing '>' char\n") ;
		return E_SYNTAX ;
	}

	threadLog("Examining %d tags for tagnam=%s\n", m_vecTags.Count(), *tagname) ;
	for (nP = 0 ; nP < pairs.Count() ; nP++)
	{
		pnam = pairs.GetKey(nP) ;
		pval = pairs.GetObj(nP) ;
		threadLog(" - with %s=%s\n", *pnam, *pval) ;
	}

	//	Get all tags in document with the tagname. It is not a failure if none found.
	Lo = m_mapTags.First(tagname) ;
	if (Lo < 0)
		threadLog("No matching tags for <%s>\n", *tagname) ;
	else
	{
		Hi = m_mapTags.Last(tagname) ;

		for (nIndex = Lo ; nIndex <= Hi ; nIndex++)
		{
			pE = m_mapTags.GetObj(nIndex) ;

			//	Exclude elements that have no ancestor among the parents
			if (parents.Count())
			{
				bFound = false ;
				for (pAnc = pE->Parent() ; pAnc ; pAnc = pAnc->Parent())
				{
					if (parents.Exists(pAnc))
					{
						bFound = true ;
						threadLog("Found parent of %p\n", pAnc) ;
						break ;
					}

					threadLog("No such parent as %p\n", pAnc) ;
				}

				if (!bFound)
					continue ;
			}

			//	No attribute/value pairs specified so the tag is added to the list
			if (!pairs.Count())
			{
				elements.Insert(pE) ;
				continue ;
			}

			nFound = 0 ;
			for (ai = pE ; ai.Valid() ; ai.Advance())
			{
				anam = ai.Name() ;
				aval = ai.Value() ;

				if (!pairs.Exists(anam))
					continue ;

				pval = pairs[anam] ;

				if (pval == "*")
					{ nFound++ ; continue ; }
				if (pval == aval)
					nFound++ ;
			}

			//	If there is a match on every attribute/value pair specified, add to the list
			if (nFound == pairs.Count())
				elements.Insert(pE) ;
		}
	}

	threadLog("Found %d tags for tagspec=[%s]\n", elements.Count(), *tagspec) ;

	return rc ;
}
hzEcode	hzDocHtml::_selectTerm	(hzSet<hzHtmElem*>& elements, const hzString& term)
{
	//	A 'term' within the context of HTML document tag selection can be a single tag specifier or several, in which case hierarchy is
	//	implied. The term is split into its tag specifiers on the '+' character.
	//
	//	Selection works on the basis of more detail, more tests: the tag name selects all tags of that name and each attribute requirement in
	//	the specifier narrows the selection. Tags are selected on what they have; there is no means to exclude tags for having something.
	//
	//	With several specifiers, _selectTag is called for each in turn, the elements found by one call being the set of acceptable ancestors
	//	for the next. As soon as a specifier selects nothing the process stops with nothing selected. This matters because _selectTag treats
	//	an empty set of parents as 'no restriction', so carrying on would select tags that lie outside the required hierarchy.
	//
	//	Arguments:	1)	elements	Set of elements selected by this function
	//				2)	term		Tag selection criteria
	//
	//	Returns:	E_SYNTAX	If the term is empty or a tag specifier is malformed or illegal
	//				E_OK		If the term is correct, even if no instances are found

	_hzfunc("hzDocHtml::_selectTerm") ;

	hzSet<hzHtmElem*>	parents ;	//	Elements found by the previous tag specifier
	hzArray<hzString>	ar ;		//	Array of tag specifiers

	uint32_t	x ;					//	For populating parents
	uint32_t	t ;					//	Tag specifier iterator
	hzEcode		rc ;				//	Return code

	SplitCSV(ar, *term, CHAR_PLUS) ;

	if (!ar.Count())
		return hzerr(E_SYNTAX, "No tag specifiers found in term") ;
	threadLog("Term is %s (%d) components\n", *term, ar.Count()) ;

	for (t = 0 ; t < ar.Count() ; t++)
		threadLog("Term component %d: %s\n", t, *ar[t]) ;

	if (ar.Count() == 1)
	{
		//	A single specifier is applied without any parent restriction
		rc = _selectTag(parents, elements, ar[0]) ;
		return rc ;
	}

	//	More than one specifier. The first is applied without parents, each later one with the previous haul as the valid parents.
	rc = _selectTag(parents, elements, ar[0]) ;

	for (t = 1 ; rc == E_OK && elements.Count() && t < ar.Count() ; t++)
	{
		parents.Clear() ;
		for (x = 0 ; x < elements.Count() ; x++)
			parents.Insert(elements.GetObj(x)) ;

		rc = _selectTag(parents, elements, ar[t]) ;
	}

	threadLog("Found %d tags for term=[%s]\n", elements.Count(), *term) ;
	return rc ;
}
static	hzEcode	s_readSelectOperand	(const char*& i, hzString& exp, hzString& term)
{
	//	Support function for hzDocHtml::_selectExp. Reads one operand of a selection expression starting at the iterator, which is advanced
	//	past the operand. The operand is either a sub-expression in parentheses, placed without the outer parentheses in exp, or a run of one
	//	or more <...> tag specifiers joined by '+', placed in term. The caller supplies exp and term empty.
	//
	//	Arguments:	1)	i		Expression iterator, positioned on the first char of the operand
	//				2)	exp		Set to the sub-expression if the operand is parenthesised
	//				3)	term	Set to the term if the operand is a run of tag specifiers
	//
	//	Returns:	E_SYNTAX	If the operand starts with neither '(' nor '<', or a parenthesis or tag specifier is left unclosed
	//				E_OK		If an operand was read

	_hzfunc(__func__) ;

	hzChain		word ;		//	Operand text
	uint32_t	level ;		//	Parenthesis nesting level

	if (*i == '(')
	{
		level = 1 ;
		for (i++ ; level && *i >= CHAR_SPACE ; i++)
		{
			if (*i == '(')	level++ ;
			if (*i == ')')	level-- ;

			if (level)
				word.AddByte(*i) ;
		}

		//	The scan ended on the terminator or a control char rather than on the matching close
		if (level)
		{
			threadLog("Unbalanced parenthesis\n") ;
			return E_SYNTAX ;
		}

		exp = word ;
		return E_OK ;
	}

	if (*i != CHAR_LESS)
	{
		threadLog("Expected an opening '<'\n") ;
		return E_SYNTAX ;
	}

	for (; *i == CHAR_LESS ;)
	{
		//	Copy up to the closing '>' but never beyond the end of the expression
		for (; *i && *i != CHAR_MORE ; i++)
			word.AddByte(*i) ;
		if (!*i)
		{
			threadLog("Tag specifier has no closing '>'\n") ;
			return E_SYNTAX ;
		}

		word.AddByte(CHAR_MORE) ;
		i++ ;
		if (*i == CHAR_PLUS)
		{
			word.AddByte(CHAR_PLUS) ;
			i++ ;
		}
	}

	term = word ;
	return E_OK ;
}

hzEcode	hzDocHtml::_selectExp	(hzSet<hzHtmElem*>& elements, const hzString& srchExp)
{
	//	Recursive support function for hzDocHtml::FindElements
	//
	//	Breaks up the expression into a single operand or 'operand op operand', where op is OR or AND. An operand in parentheses is evaluated
	//	by a recursive call to this function, any other by _selectTerm. OR places the union of the two result sets in elements, AND the
	//	intersection. Anything following the second operand is not examined.
	//
	//	Arguments:	1)	elements	The set of elements selected
	//				2)	srchExp		Search expression
	//
	//	Returns:	E_SYNTAX	If the expression is malformed
	//				E_OK		If the operation was successful (it still may have found no elements)

	_hzfunc("hzDocHtml::_selectExp") ;

	hzSet<hzHtmElem*>	setA ;		//	Element set for first operand
	hzSet<hzHtmElem*>	setB ;		//	Element set for second operand

	hzHtmElem*	pE ;				//	HTML element
	const char*	i ;					//	For processing criteria
	hzString	termA ;				//	First operand, if a term
	hzString	termB ;				//	Second operand, if a term
	hzString	expA ;				//	First operand, if a sub-expression
	hzString	expB ;				//	Second operand, if a sub-expression
	uint32_t	op ;				//	1 for OR and 2 for AND
	uint32_t	n ;					//	Counter
	hzEcode		rc = E_OK ;			//	Return code

	//	Get 1st operand
	i = *srchExp ;
	if (!i)
	{
		threadLog("Expected an opening '<'\n") ;
		return E_SYNTAX ;
	}

	for (; *i && *i <= CHAR_SPACE ; i++) ;

	rc = s_readSelectOperand(i, expA, termA) ;
	if (rc != E_OK)
		return rc ;

	if (*i == 0)
	{
		//	No further operands so the result of this one goes directly to the element list
		threadLog("Calling _selectTerm with a single exp [%s] term [%s]\n", *srchExp, *termA) ;
		if (expA)	rc = _selectExp(elements, expA) ;
		if (termA)	rc = _selectTerm(elements, termA) ;
		threadLog("case 1 Found %d tags for term=[%s]\n", elements.Count(), *srchExp) ;
		return rc ;
	}

	//	Get operator
	//	NOTE(review): this relies on CstrCompareI reporting a match when the text at i merely begins with the operator - confirm.
	for (; *i && *i <= CHAR_SPACE ; i++) ;

	if (!CstrCompareI(i, "or"))
		{ i += 2 ; op = 1 ; }
	else if (!CstrCompareI(i, "and"))
		{ i += 3 ; op = 2 ; }
	else
	{
		threadLog("Illegal operator [%s]\n", i) ;
		return E_SYNTAX ;
	}

	//	Get 2nd operand
	for (; *i && *i <= CHAR_SPACE ; i++) ;

	rc = s_readSelectOperand(i, expB, termB) ;
	if (rc != E_OK)
		return rc ;

	//	Evaluate both operands. A failure on either is reported rather than being masked by the result of the other.
	threadLog("Calling _selectTerm with terms [%s:%s] and [%s:%s]\n", *expA, *termA, *expB, *termB) ;

	if (expA)	rc = _selectExp(setA, expA) ;
	if (termA)	rc = _selectTerm(setA, termA) ;
	if (rc != E_OK)
		return rc ;

	if (expB)	rc = _selectExp(setB, expB) ;
	if (termB)	rc = _selectTerm(setB, termB) ;
	if (rc != E_OK)
		return rc ;

	//	Apply operator
	if (op == 1)
	{
		threadLog("OR'ing\n") ;

		for (n = 0 ; n < setA.Count() ; n++)
			elements.Insert(setA.GetObj(n)) ;

		for (n = 0 ; n < setB.Count() ; n++)
			elements.Insert(setB.GetObj(n)) ;

		threadLog("(total %d)\n", elements.Count()) ;
	}
	else
	{
		threadLog("AND'ing\n") ;
		for (n = 0 ; n < setA.Count() ; n++)
		{
			pE = setA.GetObj(n) ;
			if (setB.Exists(pE))
				elements.Insert(pE) ;
		}
	}

	threadLog("Found %d tags for term=[%s]\n", elements.Count(), *srchExp) ;
	return rc ;
}
hzEcode	hzDocHtml::FindElements	(hzVect<hzHtmElem*>& elements, const hzString& srchExp)
{
	//	Select elements from this document according to the supplied search expression
	//
	//	Webpages commonly confine their information content to a limited set of elements (tags). Where it is known which elements hold what
	//	(title, author, body content etc), this function can be used to select those elements so that data can be extracted from them.
	//
	//	Arguments:	1)	elements	The vector of HTML elements to be populated by this query. Emptied first. The elements are placed in
	//								ascending order of element id, which is intended to give them in order of incidence in the document.
	//				2)	srchExp		The criteria as a boolean expression of one or more terms, each specifying how elements are selected.
	//
	//	Returns:	E_SYNTAX	If the expression is malformed
	//				E_OK		If the operation was successful (it still may have found no elements) or the expression is empty
	//
	//	Support functions:
	//
	//	_selectExp() does the selecting and returns the elements in a hzSet, which ensures each is only counted once. It breaks the expression
	//	into a term or 'term op expression' and calls _selectTerm() for each term.
	//
	//	_selectTerm() deals with a single term. A term consists of one or more tag specifiers which, when multiple, are separated by a + sign.
	//	It calls _selectTag() on each tag specifier in turn, each call being limited to descendants of the tags found by the one before.
	//
	//	_selectTag() uses a single tag specifier to select tags from the document and, if a set of parents (previously found tags) is supplied,
	//	only accepts those with an ancestor in that set. A tag specifier is encased in a <> block and has the general form
	//	<tagname attr1='value1' attr2='value2' ...>. An asterisk may be given in place of a quoted value to accept any value.

	_hzfunc("hzDocHtml::FindElements") ;

	hzMapS<uint32_t,hzHtmElem*>	byUid ;		//	Selected elements keyed by element id
	hzSet<hzHtmElem*>			chosen ;	//	Selected elements as returned by _selectExp

	hzHtmElem*	pTag ;		//	The HTML element (tag)
	uint32_t	n ;			//	Iterator
	hzEcode		rc ;		//	Return code

	elements.Clear() ;
	if (!srchExp)
		return E_OK ;

	//	Evaluate the expression
	rc = _selectExp(chosen, srchExp) ;
	if (rc != E_OK)
	{
		threadLog("Failed\n") ;
		return rc ;
	}

	//	Order the selection by element id
	n = 0 ;
	while (n < chosen.Count())
	{
		pTag = chosen.GetObj(n) ;
		byUid.Insert(pTag->GetUid(), pTag) ;
		n++ ;
	}

	//	Then hand it over in that order
	n = 0 ;
	while (n < chosen.Count())
	{
		pTag = byUid.GetObj(n) ;
		elements.Add(pTag) ;
		n++ ;
	}

	threadLog("Got %d elements\n", chosen.Count()) ;
	return E_OK ;
}
/* ** Section 2: hzHtmElem members */
hzEcode	hzHtmElem::Init	(hzDocHtml* pRoot, hzHtmElem* pParent, hzString& tagname, hzHtagtype type, uint32_t id, uint32_t line)
{
	//	Initialize a HTML element (tag): record its name, tag type, numeric id and source line number, and attach it to the parent element
	//	(if any).
	//
	//	Arguments:	1)	pRoot	Pointer to the HTML document root (must be non-null; it is only checked here, not stored)
	//				2)	pParent	Pointer to the parent element of this, or null for a top level element
	//				3)	tagname	The name of this tag
	//				4)	type	HTML tag type
	//				5)	id		Numeric identifier (uid) of this element
	//				6)	line	Line number of the tag in the source HTML
	//
	//	Returns:	E_ARGUMENT	If no root is supplied
	//				E_OK		If the HTML element was initialized

	_hzfunc("hzHtmElem::Init") ;

	if (!pRoot)
	{
		hzerr(E_ARGUMENT, "No root supplied") ;
		return E_ARGUMENT ;
	}

	//	Establish this element's identity BEFORE linking it to the parent. _addnode() records the new child by calling pNode->GetUid(), so the
	//	uid has to be in place by then, otherwise the parent would link to whatever uid this element held previously.
	//	NOTE(review): this assumes GetUid() reports m_Uid - confirm against the class definition.
	m_Name = tagname ;
	m_Type = type ;
	m_Uid = id ;
	m_nLine = line ;

	if (!pParent)
	{
		m_Parent = 0 ;
		m_nLevel = 0 ;
	}
	else
	{
		m_Parent = pParent->GetUid() ;
		m_nLevel = pParent->m_nLevel + 1 ;

		//	_addnode() reports its own failures via hzerr(); as before, its return code does not alter the outcome of Init().
		pParent->_addnode(this) ;
	}

	//	A freshly initialized element has no children and no following sibling
	m_Children = 0 ;
	m_Sibling = 0 ;

	return E_OK ;
}
hzEcode	hzHtmElem::_addnode	(hzHtmElem* pNode)
{
	//	Adds an element as a subnode of this. Subnodes are always appended to the end of the sibling chain.
	//
	//	Arguments:	1)	pNode	Element to add as child of this element
	//
	//	Returns:	E_ARGUMENT	If no element is supplied
	//				E_DUPLICATE	If the supplied element is this element, or is already a child of this element
	//				E_OK		If the element is added as child

	_hzfunc("hzHtmElem::_addnode") ;

	hzHtmElem*	p_temp ;	//	Current node pointer

	if (!pNode)
		return hzerr(E_ARGUMENT, "Attempt to add a null node") ;
	if (pNode == this)
		return hzerr(E_DUPLICATE, "Attempt to add a node to itself (%s)", *m_Name) ;

	if (!m_Children)
		m_Children = pNode->GetUid() ;
	else
	{
		//	Walk to the last child, checking EVERY child (including the last one) against the node being added. Stopping the duplicate test
		//	one short would let the last child be appended to itself, making its sibling link point back at its own uid.
		p_temp = GetFirstChild() ;
		for (;;)
		{
			if (pNode == p_temp)
				return hzerr(E_DUPLICATE, "Attempt to add an already existing node to %s", *m_Name) ;

			if (!p_temp->m_Sibling)
				break ;
			p_temp = p_temp->Sibling() ;
		}

		p_temp->m_Sibling = pNode->GetUid() ;
	}

	m_nSubnodes++ ;
	return E_OK ;
}
/*
**	Section 2A: hzHtmTbl members
*/
uint32_t	hzHtmTbl::Colcount	(void)
{
	//	Establishes the number of column headers.
	//
	//	If a non-zero column count is already held in m_nCols it is returned as is. Otherwise the children of the table's first subnode (its
	//	first row) are walked and every <th> (HTAG_TH) child is counted. A table whose first row has no <th> cells therefore reports 0 and
	//	is re-examined on each call.
	//
	//	Arguments:	None
	//	Returns:	Number of column headers found (0 if the table is empty or its first row has no <th> cells)

	hzHtmElem*	pE ;	//	First table row
	hzHtmElem*	pC ;	//	Cells of the first row

	if (!m_nCols)
	{
		pE = GetFirstChild() ;

		//	An empty table has no first row to examine
		if (!pE)
			return m_nCols ;

		for (pC = pE->GetFirstChild() ; pC ; pC = pC->Sibling())
		{
			if (pC->Type() == HTAG_TH)
				m_nCols++ ;
		}
	}

	return m_nCols ;
}
uint32_t	hzHtmTbl::Rowcount	(void)
{
	//	Returns the number of rows. This will not include the row of headers.
	//
	//	The count is derived from the number of subnodes of the table: if column headers are known (m_nCols is non-zero) the first subnode is
	//	taken to be the header row and is excluded, otherwise every subnode counts as a row.
	//
	//	Arguments:	None
	//	Returns:	Number of rows in the table

	if (!m_nSubnodes)
		return 0 ;

	//	Look for column headers if none are known yet. The !m_nRows test keeps the header scan to the first call for header-less tables.
	if (!m_nCols && !m_nRows)
		Colcount() ;

	//	Always derive the row count from the current subnode count. Keying the computation on m_nCols alone would return a never-computed
	//	m_nRows whenever Colcount() had been called before the first call to Rowcount().
	if (!m_nCols)
		m_nRows = m_nSubnodes ;
	else
		m_nRows = m_nSubnodes - 1 ;

	return m_nRows ;
}
hzString	hzHtmTbl::GetColl	(uint32_t nCol)
{
	//	Return the header text of the requested column.
	//
	//	In the case of a table, the only allowed sub-nodes are <tr> nodes. The column headers for the table are all under the table's first
	//	<tr> sub-node as <th> nodes, so the nCol'th <th> cell of the first row is located and its content returned.
	//
	//	Arguments:	1)	nCol	The column number (0 is the first <th> cell)
	//
	//	Returns:	Instance of hzString by value - the content of the column header, or an empty string if the table is empty or has no such
	//				header

	hzHtmElem*	pE ;		//	First table row
	hzHtmElem*	pC ;		//	Cells of the first row
	hzString	S ;			//	Target string
	uint32_t	nIndex ;	//	Header cell counter

	if (!m_Children)
		return S ;

	pE = GetFirstChild() ;
	if (!pE)
		return S ;

	nIndex = 0 ;
	for (pC = pE->GetFirstChild() ; pC ; pC = pC->Sibling())
	{
		//	Header cells are <th> (HTAG_TH), the same test Colcount() uses to count the columns
		if (pC->Type() != HTAG_TH)
			continue ;

		if (nIndex == nCol)
		{
			S = pC->m_tmpContent ;
			break ;
		}

		nIndex++ ;
	}

	return S ;
}
hzString	hzHtmTbl::GetCell	(uint32_t nRow, uint32_t nCol)
{
	//	Return the cell from the supplied row and column.
	//
	//	Method is to move through the table's <tr> subnodes to get to the row, then move through that row's <td> (or equivalent) tags to get
	//	to the column within the row (the cell). The table's first <tr> subnode is the header row and is not counted, so row 0 is the second
	//	<tr> subnode.
	//
	//	Arguments:	1)	nRow	The row number (0 is the first row after the header row)
	//				2)	nCol	The column number (0 is the first cell of the row)
	//
	//	Returns:	Instance of hzString by value - of the table cell. This is "No child nodes" if the table is empty, "No columns" if the table
	//				has no column headers, and an empty string if the requested row or column does not exist.

	hzHtmElem*	pR ;	//	Table row tags
	hzHtmElem*	pC ;	//	Columns
	hzString	S ;		//	Target string
	uint32_t	row ;	//	Row counter
	uint32_t	col ;	//	Column counter

	if (!m_Children)
	{
		S = "No child nodes" ;
		return S ;
	}

	//	Make sure the headers have been looked for before declaring that there are none
	if (!Colcount())
	{
		S = "No columns" ;
		return S ;
	}

	//	Step over the header row explicitly. Starting an unsigned row counter at -1 to do this wraps it to the maximum value, which then
	//	never compares <= nRow, so no row is ever visited.
	pR = GetFirstChild() ;
	if (pR)
		pR = pR->Sibling() ;

	//	Advance to the requested row
	for (row = 0 ; pR && row < nRow ; row++)
		pR = pR->Sibling() ;
	if (!pR)
		return S ;

	//	Advance to the requested cell within the row
	pC = pR->GetFirstChild() ;
	for (col = 0 ; pC && col < nCol ; col++)
		pC = pC->Sibling() ;

	if (pC)
		S = pC->m_tmpContent ;

	return S ;
}
/* ** Non-member functions */
hzDoctype	DeriveDoctype	(hzChain& Z)
{
	//	Category:	Text Processing
	//
	//	Rudimentary check to determine if the document is HTML or XML. Everything up to the first '<' is skipped and the markup found at that
	//	point is compared against the openings "<html", "<!DOCTYPE html", "<!DOCTYPE xml" and "<?xml".
	//
	//	Argument:	Z	Input document
	//
	//	Returns:	DOCTYPE_HTML		If the first markup is <html or a html DOCTYPE declaration
	//				DOCTYPE_XML			If the first markup is an xml DOCTYPE declaration or a <?xml prolog
	//				DOCTYPE_UNDEFINED	Otherwise

	chIter	cursor ;	//	Chain iterator

	//	Skip to the first opening angle bracket (or the end of the input)
	cursor = Z ;
	while (!cursor.eof() && *cursor != CHAR_LESS)
		cursor++ ;

	if (cursor.Equiv("<html"))
		return DOCTYPE_HTML ;

	if (cursor.Equiv("<!DOCTYPE "))
	{
		//	Step over the 10 characters of "<!DOCTYPE " to reach the declared root name
		cursor += 10 ;

		if (cursor.Equiv("html"))
			return DOCTYPE_HTML ;
		if (cursor.Equiv("xml"))
			return DOCTYPE_XML ;
	}

	if (cursor.Equiv("<?xml"))
		return DOCTYPE_XML ;

	return DOCTYPE_UNDEFINED ;
}