HTML DOM Parser

migajek

2005-12-08 13:02:12
2005-12-08 13:02:12
9 komentarzy
2576 odsłon

Ponieważ mój ostatni parser HTML był dosyć dziwnie zbudowany, w dodatku umożliwiał tylko zdobycie właściwości konkretnego taga, napisałem kolejny ... tym razem nie jest to tylko parser, ale również inspektor DOM (dzięki któremu możemy zobaczyć strukturę dokumentów [patrz: demo]). Dodatkowo, wbudowany parser pozwala na wydobycie wszystkich właściwości konkretnego taga.

Kod źródłowy jest komentowany tak sobie ;) [czytaj: tak zebym sam zrozumiał], a omawiać mi sie nie chce... jak ktoś nie rozumie - czekam na pytania w komentarzach ;) jak rozumie - czekam na oceny :]

Ogólnie wygląda to następująco : klasa TMigHTMLParser po utworzeniu (z kodem HTML w parametrze) rozpoczyna parsowanie. Po ukończeniu w zmiennej Root mamy do dyspozycji "główny" tag (powinien być to tag <HTML>) jako obiekt typu TMigHTMLTag. Własności tego obiektu to
TagName : nazwa taga (np. html, body itp)
TagContent: zawartość (wyłącznie tekst)
Parent: rodzic (może być pusty)
List: lista tagów wewnątrz
Jak widać, za sprawą ostatniej wartości "łańcuch" może ciągnąć się w nieskończoność :)

Na pierwszy rzut oka może się to wydawać trudne, dlatego zamieszczam demo użycia.

Jeśli chcesz użyć tego w programie - powiadom mnie o tym lub zamieść w programie informację o użyciu tego kodu :)

(*--------------------------------------------|
| HTML DOM Parser  v1.2                       |
|---------------------------------------------|
| author: Michał Gajek                        |
| web: http://www.migajek.com                 |
| email: [email protected]                    |
| Copyright ?  2005 by Michał Gajek           |
|---------------------------------------------|
| Released under the terms and conditions of  |
| the GNU General public License (Version 2)  |
+--------------------------------------------*)

unit MigHTMLParser;

interface
uses SysUtils, Classes;

//tagi nie wymagajace zamkniecia
const
 STags : array[0..5] of string = ('br','hr','img','link','input','meta');
type
  //parametr tagu (nazwa, wartosc)
  TMigHTMLParam = record
   fName, fVal:string;
  end;

  //deklaracja klasy
  TMigHTMLTag = class;

  //lista parametrow taga
  TMigHTMLParams = class
//   private
    fList: array of TMigHTMLParam;
//   public
    procedure Add(AName, AValue: string);        //dodaj
    function GetByName(AName: string):string;    //pobierz z nazwy
    function GetByIndex(AIndex: integer):string; //pobierz z indexu
    function Count:integer;                      //count = high (fList)
  end;

  //lista tagow HTML
  TMigHTMLTagList = class
   private
    fList: array of TMigHTMLTag;
   public
    procedure Add(ATag: TMigHTMLTag);
    function GetByIndex(AIndex: integer):TMigHTMLTag;
    function Count:integer;
   end;

  //jeden tag
  TMigHTMLTag = class
   TagName:string;      //nazwa taga
   TagContent: TStringList;  //zawartosc tekstowa taga. kolejne elementy sa dodawane po kolejnych tagach
   Params: TMigHTMLParams; //parametry
   Parent: TMigHTMLTag;    //rodzic, dopuszczalny nil
   List: TMigHTMLTagList;  //lista "dzieci"
   constructor Create;
  end;

  TMigHTMLParser = class
   private
    fPos:integer;
    fText: string;
    function GetTextToNextTag:string;
    procedure Parse();
    function ParseParam():string;
    function StripComments():boolean;
   public
    Root: TMigHTMLTag;
    constructor Create(AText:string);
  end;


implementation

{=========== TMigHTMLParams ===============}

procedure TMigHTMLParams.Add(AName, AValue: string);
begin
SetLength(fList,length(fList)+1);
with fList[High(fList)] do
 begin
  fName:= AName;
  fVal := AValue;
 end;
end;


function TMigHTMLParams.GetByIndex(AIndex: integer):string;
begin
result:='';
if (AIndex <= high(fList) ) and (AIndex >= low(fList)) then
 result:=fList[AIndex].fVal;
end;

function TMigHTMLParams.GetByName(AName: string):string;
var
i:integer;
begin
AName:= lowercase(AName);
result:= '';
for i:=low(fList) to high(fList) do
 if lowercase(fList[i].fName) = AName then
  begin
   result:= fList[i].fVal;
   break;
  end;
end;

function TMigHTMLParams.Count:integer;
begin
result:= High(fList);
end;



{=========== TMigTagList ===============}


procedure TMigHTMLTagList.Add(ATag: TMigHTMLTag);
begin
SetLength(fList, length(fList) + 1);
fList[High(fList)] := ATag;
end;

function TMigHTMLTagList.GetByIndex(AIndex: integer):TMigHTMLTag;
begin
if AIndex > High(fList) then
 result:=nil
else
 result:=fList[AIndex];
end;

function TMigHTMLTagList.Count:integer;
begin
result:= high(fList);
end;


{=========== TMigHTMLTag ===============}


constructor TMigHTMLTag.Create;
begin
TagName := '';
Parent:= nil;
List:= TMigHTMLTagList.Create;
Params:= TMigHTMLParams.Create;
TagContent:= TStringList.Create;
end;



{=========== TMigHTMLParser ===============}

constructor TMigHTMLParser.Create(AText:string);
begin
fPos:= 1;
fText:= AText;
Parse();
end;

function TMigHTMLParser.StripComments():boolean;
begin
result:=false;
if (copy(fText,fPos,4) = '<!--') or (lowercase(copy(fText,fPos,9)) = '<!doctype') then
 begin
 result := true;
 while fText[fPos] <> '>' do
  inc(fPos);
 end
else if lowercase(copy(ftext,fPos,7)) = '<script' then
 begin
  result:=true; 
  while copy(ftext,fpos-8,8) <> '/script>' do
   inc(fPos);
 end
end;

function TMigHTMLParser.GetTextToNextTag:string;
begin
result:='';
if fText[fPos] = '>' then Inc(fPos);
while fPos<= length(fText) do
 begin
  if fText[fPos] = '<' then
   begin
    dec(fPos);
    break;
   end;

  if not (fText[fPos] in [#13,#10]) then
   result:=result+ fText[fPos];
  inc(fPos);
 end;
end;

procedure TMigHTMLParser.Parse;
var
tmpCnt,L_token:string;
CurrParent, tmp: TMigHTMLTag;

function __inSTags(s:string):boolean;
var
_i:integer;
begin
result:=false;
for _i:=low(sTags) to high(stags) do
 if lowercase(s) = lowercase(sTags[_i]) then
  begin
   result:= true;
   break;
  end;
end;

begin
CurrParent:=nil;
tmp:= nil;
while fPos <= length(fText) do
 begin
  case fText[fPos] of
   '<':if not StripComments() then begin
        if fText[fPos+1]<>'/' then    //jesli to nie jest zakonczenie
        begin
         if CurrParent = nil then
           CurrParent:= TMigHTMLTag.Create
         else
          begin
           tmp:= CurrParent;
           CurrParent:= TMigHTMLtag.Create;
           CurrParent.Parent := tmp;
           tmp.List.Add(CurrParent); //dodaj do listy dzieci obecny tag
          end;
          l_token:='';
        end
        else      ///jesli zakonczenie
         begin
          if CurrParent.parent <> nil then
           CurrParent:=CurrParent.parent;
         end;

       end;
   '>' :
          begin
           if CurrParent.TagName = '' then
            CurrParent.TagName:=l_token;   //ustaw nazwe jesli jeszcze nie ma

           if (length(l_token) > 0) and (l_token[1] <> '/') then  //jesli nie jest to zakonczenie
            CurrParent.TagContent.Text:=GetTextToNextTag       //to ustaw tekst
           else
            CurrParent.TagContent.Add(GetTextToNextTag);

           if (fText[fPos-1] = '/') or (__inSTags(l_token)) then
            if CurrParent.parent <> nil then
            CurrParent:=CurrParent.parent;

           if __inSTags(CurrParent.TagName) then
            CurrParent:=CurrParent.Parent;

           l_token:= '';
          end;
   ' ' : begin
          if CurrParent.TagName = '' then
           begin
            CurrParent.TagName:=l_token;
            l_token :='';
           end; 
         end;
   '=' : begin
          CurrParent.Params.Add(l_token,ParseParam());
          l_token:='';
         end;
   else l_token:=l_token+fText[fPos];
  end;
  inc(fPos);
 end;


Root:= CurrParent;
end;

function TMigHTMLParser.ParseParam():string;
var
l_token:string;
bchar:set of char;
begin
result:= '';

//usuwanie spacji po znaku =
if fText[fpos] = '=' then
  while fText[fPos+1]=' ' do
   inc(fPos);

if fText[fPos+1] = '"' then
 begin
  bchar:=['"','>'];
  inc(fPos,2);
 end
else if fText[fPos+1] = '''' then
 begin
  bchar:=['''','>'];
  inc(fPos,2);
 end
else
 begin
 bchar:=[' ','>'];
 inc(fPos, 1);
 end;

while fPos <= length(fText) do
 begin
   if (fText[fPos] in bchar) then
    begin
     result:=(l_token);
     break;
    end
   else
   l_token:=l_token+fText[fPos];

  inc(fPos);
 end;
end;
end.