unit XMLUtilities;

{
[XMLUtilities] [1.2]
Delphi 2005
October 2006

LICENSE

The contents of this file are subject to the Mozilla Public License Version
1.1 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
"http://www.mozilla.org/MPL/"

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.

The Original Code is "[XMLUtilities.pas]".

The Initial Developer of the Original Code is Martin Holmes (Victoria,
BC, Canada, "http://www.mholmes.com/"). Copyright (C) 2005 Martin Holmes 
and the University of Victoria Computing and Media Centre. The code was 
co-developed for university and personal projects, and rights are shared
by Martin Holmes and the University of Victoria. All Rights Reserved.
}

{
Written by Martin Holmes, starting October 2005.

Utility functions using the MSXML libraries distributed with Internet Explorer,
currently incorporating functions checking the well-formedness of blocks of XML
code. The use of MSXML is for speed and convenience during the development
process; eventually we may be able to use alternative open-source libraries
for this.

 TntUnicode libraries (Troy Wolbrink).
 XDOM_4_1 (Dieter Köhler)
 JEDI Component Library (Project JEDI)
}

interface

uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, TntForms, xmldom, XMLIntf, msxmldom, XMLDoc, TntDialogs, StdCtrls,
  TntStdCtrls, XDOM_4_1, TntSysUtils, XMLRulesUtils, TntClasses, jclUnicode,
  Math, cUnicodeCodecs;

{XDOM 3.2 removed TXMLToDomParser.ParseWideString (Grrrr!) and there's no
useful replacement, so we may in future need a cracker class to get at the
protected TXmlToDomParser.ParseFragment.}

//type TXmlToDomParserCracker = class(TXmlToDomParser);

type
  TufrmXMLUtilities = class(TTntForm)
    XMLDoc: TXMLDocument;
    umsgXMLNotWellFormed: TTntStaticText;
    XSLDoc: TXMLDocument;
    XHTMLDoc: TXMLDocument;
  private
    procedure SplitXMLAtTagBoundaries(var SList: TTntStringList;
      Input: WideString);
    function EscapeEntities(Input: WideString): WideString;
    function MakeCloseTagFromOpenTag(OpenTag: WideString): WideString;


    { Private declarations }
  public
    { Public declarations }
    function CheckXMLWellFormedness(InXML: WideString; var ErrorLineNum,
                        ErrorLinePos: integer; var ErrorLine: WideString;
                        ShowReport: Boolean): Boolean;
    function CheckXMLFileWellFormedness(TheFileName: WideString; var ErrorLine,
      ErrorLinePos: integer; ShowReport: Boolean): Boolean;

    function CloseTagMatchesOpenTag(CloseTag, OpenTag: WideString): Boolean;
    function GetTagNameFromTag(Tag: WideString): WideString;
    function FindTagType(Tag: WideString): integer;
    function AttributeIsWellFormed(Attr: WideString): Boolean;
    function AttributesAreWellFormed(Atts: WideString): Boolean;
    function FixImproperlyNestedTags(XMLFragment: WideString): WideString;
    function CreateXMLFileWrapper(FilePath, XMLFile, XSLFile: WideString): Boolean;
    function MakeXMLNCName(var wsSource: WideString): Boolean;

    function DoXSLTransformation(InXMLDoc, InXSLDoc: WideString): WideString;
    function DoXSLTransformationFromFiles(InXMLDocFile,
      InXSLDocFile: WideString): WideString;

    function IsWellFormedXMLFragment(InXML: WideString): Boolean;
    function IsWellFormedXMLTree(InXML: WideString): Boolean;

    function AddIndentToNode(El: TDomElement; DomDoc: TDomDocument; NumSpaces: integer): Boolean;
    function ParseWideStringToDomNode(InXML: WideString;
                                      DomImpl: TDomImplementation;
                                      TargetDoc: TDomDocument): TDomNode;
  end;

var
  ufrmXMLUtilities: TufrmXMLUtilities;

const
  XMLOpenTag = 0;
  XMLCloseTag = 1;
  XMLSelfClosingTag = 2;
  NotXMLTag = -1;

implementation

{$R *.dfm}

{ TufrmXMLUtilities }

//This function modelled on one from Marco Cantù's Matering Delphi 2005 p 864.
function TufrmXMLUtilities.CheckXMLWellFormedness(InXML: WideString;
        var ErrorLineNum, ErrorLinePos: integer; var ErrorLine: WideString;
        ShowReport: Boolean): Boolean;
var
eParse: IDOMParseError;
Messidge: WideString;
uslLines: TTntStringList;
ProblemLine: WideString;

begin
  Result := True; //default if no error
  XMLDoc.Active := True;
  (XMLDoc as IXMLDocumentAccess).DOMPersist.loadxml(InXML);
  eParse := (XMLDoc.DomDocument as IDOMParseError);
  if eParse.errorCode <> 0 then
    with eParse do
      begin
        ErrorLineNum := Line;
        ErrorLinePos := LinePos;
        ErrorLine := '';
        uslLines := TTntStringList.Create;
        try
          uslLines.Text := InXML;
          if (ErrorLineNum > 0) and (ErrorLineNum <= uslLines.Count) then
            ErrorLine := uslLines[ErrorLineNum-1];
        finally
          FreeAndNil(uslLines);
        end;
        if ShowReport then
          begin

            Messidge := WideFormat(umsgXMLNotWellFormed.Caption, [IntToStr(Line), IntToStr(LinePos)]);
            if Length(ProblemLine) > 0 then
              Messidge := Messidge + #13#10#13#10 + ProblemLine + #13#10;
            Messidge := Messidge + #13#10 + Reason;
            WideMessageDlg(Messidge, mtWarning, [mbOK], 0);
          end;
        Result := False;
      end;
end;

function TufrmXMLUtilities.CheckXMLFileWellFormedness(TheFileName: WideString;
        var ErrorLine, ErrorLinePos: integer; ShowReport: Boolean): Boolean;
var
eParse: IDOMParseError;
Messidge: WideString;
FileData: WideString;

begin
  Result := True; //default if no error
  XMLDoc.Active := True;
  XMLDoc.LoadFromFile(TheFileName);
  eParse := (XMLDoc.DomDocument as IDOMParseError);
  if eParse.errorCode <> 0 then
    with eParse do
      begin
        ErrorLine := Line;
        ErrorLinePos := LinePos;
        if ShowReport then
          begin
            Messidge := WideFormat(umsgXMLNotWellFormed.Caption, [IntToStr(Line), IntToStr(LinePos)]);
            Messidge := Messidge + #13#10 + Reason;
            WideMessageDlg(Messidge, mtWarning, [mbOK], 0);
          end;
        Result := False;
      end;
end;

function TufrmXMLUtilities.AttributeIsWellFormed(Attr: WideString): Boolean;
var
AttrName, AttrVal: WideString;
i: integer;
WC: WideChar;

begin
  Result := False; //default
  Attr := WideTrim(Attr);
//There is a minimum length
  if Length(Attr) < 5 then
    Exit;
  i := Pos(WideString('='), Attr);
  if (i < 2) or (i > Length(Attr)-3) then
    Exit;
  AttrName := Copy(Attr, 1, i-1);
  if not IsXMLNmToken(AttrName) then
    Exit;
  AttrVal := Copy(Attr, i+1, Length(Attr)-i);
  WC := AttrVal[1];
  if not ((WC = WideChar('"')) or (WC = WideChar(''''))) then
    Exit;
  WC := AttrVal[Length(AttrVal)];
  if not ((WC = WideChar('"')) or (WC = WideChar(''''))) then
    Exit;
  Result := True; //passed all the tests!
end;

function TufrmXMLUtilities.AttributesAreWellFormed(Atts: WideString): Boolean;
var
i: integer;
Attr: WideString;
InSingleQuotes, InDoubleQuotes: Boolean;

begin
  Result := True; //default to true in this case -- it's simpler
  Atts := WideTrim(Atts);
  if Length(Atts) < 5 then
    Exit;
  i := 1;
  InSingleQuotes := False;
  InDoubleQuotes := False;
  Attr := '';
  while i <= Length(Atts) do
    begin
      Attr := Attr + Atts[i];
      if Atts[i] = WideChar('''') then //single quote
        begin
          if InSingleQuotes then //it's the end of a name/value pair
            begin
              if not AttributeIsWellFormed(Attr) then
                begin
                  Result := False;
                  Exit;
                end;
              InSingleQuotes := False;
              Attr := '';
            end
          else
            begin
              if not InDoubleQuotes then
                InSingleQuotes := True;
            end;
        end
      else
        if Atts[i] = WideChar('"') then //double quote
          begin
            if InDoubleQuotes then //it's the end of a name/value pair
              begin
                if not AttributeIsWellFormed(Attr) then
                  begin
                    Result := False;
                    Exit;
                  end;
                InDoubleQuotes := False;
                Attr := '';
              end
            else
              begin
                if not InSingleQuotes then
                  InDoubleQuotes := True;
              end;
          end;
      inc(i);
    end;
//If there's anything left over it must be ill-formed
    if Length(Attr) > 0 then
      Result := False;
end;

function TufrmXMLUtilities.FindTagType(Tag: WideString): integer;
var
StartAtts, EndAtts: integer;
Attributes: WideString;

begin
  Result := NotXMLTag; //default
  if Tag[1] = WideChar('<') then
    if Tag[Length(Tag)] = WideChar('>') then
      begin
//Check if it's a close tag
        if Tag[2] = WideChar('/') then
          if Length(Tag) > 3 then
            begin
              if IsXMLNmToken(Copy(Tag, 3, Length(Tag)-3)) then
                begin
                  Result := XMLCloseTag;
                  Exit;
                end;
            end;

//Check if it's an open tag with no attributes
        if IsXMLNmToken(Copy(Tag, 2, Length(Tag)-2)) then
          begin
            Result := XMLOpenTag;
            Exit;
          end;

//Check if it's a self-closing tag with no attributes
        if Tag[Length(Tag)-1] = WideChar('/') then
          if IsXMLNmToken(WideTrim(Copy(Tag, 2, Length(Tag)-3))) then
            begin
              Result := XMLSelfClosingTag;
              Exit;
            end;

//May be an open tag or a self-closing tag with attributes
        if not IsXMLNmToken(Copy(Tag, 2, Pos(WideString(' '), Tag)-2)) then
          Exit;

//Now we have to check the attributes
        StartAtts := Pos(WideString(' '), Tag);
        EndAtts := Length(Tag) - 1;
        while not ((Tag[EndAtts] = WideChar('"')) or (Tag[EndAtts] = WideChar(''''))) and
                  (EndAtts > 1) do
          dec(EndAtts);
        if EndAtts < StartAtts+4 then
          Exit; //not enough for even a single minimal well-formed attribute
        Attributes := Copy(Tag, StartAtts, EndAtts-(StartAtts-1));
        if not AttributesAreWellFormed(Attributes) then
          Exit;

//So it's passed all these tests -- now find out which kind of tag it is
        if Tag[Length(Tag)-1] = WideChar('/') then
          Result := XMLSelfClosingTag
        else
          Result := XMLOpenTag;
      end;
end;

function TufrmXMLUtilities.GetTagNameFromTag(Tag: WideString): WideString;
var
StartName, EndName: integer;
begin
  Result := ''; //default
  Tag := WideTrim(Tag);
//Is it an open or a close tag
  if Tag[2] = WideChar('/') then
    StartName := 3
  else
    StartName := 2;
  EndName := StartName;
  while IsXMLNameChar(Tag[EndName]) do
    inc(EndName);
  Result := Copy(Tag, StartName, EndName-StartName);
end;

function TufrmXMLUtilities.CloseTagMatchesOpenTag(CloseTag, OpenTag: WideString): Boolean;
begin
  Result := GetTagNameFromTag(CloseTag) = GetTagNameFromTag(OpenTag);
end;

procedure TufrmXMLUtilities.SplitXMLAtTagBoundaries(var SList: TTntStringList; Input: WideString);
var
i: integer;
wsTemp: WideString;

begin
  SList.Clear;
  wsTemp := '';
  for i := 1 to Length(Input) do
    begin
//Check for the beginning of a tag
      if (Input[i] = WideChar('<')) and (Length(wsTemp) > 0) then
        begin
          SList.Add(wsTemp);
          wsTemp := '';
        end;

      wsTemp := wsTemp + Input[i];

//Check for the end of a tag
      if Input[i] = WideChar('>') then
        begin
          SList.Add(wsTemp);
          wsTemp := '';
        end;
    end;
//Add anything left over
  if Length(wsTemp) > 0 then
    SList.Add(wsTemp);
end;

function TufrmXMLUtilities.EscapeEntities(Input: WideString): WideString;
var
i: integer;
j: integer;

begin
  Result := '';
  if Length(Input) > 0 then
    for i := 1 to Length(Input) do
      if Input[i] = WideChar('<') then
        Result := Result + WideString('&lt;')
      else
        if Input[i] = WideChar('>') then
          Result := Result + WideString('&gt;')
        else
          if Input[i] = '&' then
            begin
              j := i;
              while (j <= Length(Input)) do
                begin
                  if (Input[j] = ' ') or (j = Length(Input)) then
                    begin
                      Result := Result + '&amp;';
                      Break;
                    end;
                  if Input[j] = ';' then
                    begin
                      Result := Result + '&';
                      Break;
                    end;
                  inc(j);
                end;
            end
          else
            Result := Result + Input[i];
end;

function TufrmXMLUtilities.MakeCloseTagFromOpenTag(OpenTag: WideString): WideString;
var
TagName: WideString;

begin
  TagName := GetTagNameFromTag(OpenTag);
  Result := WideString('</') + TagName + WideString('>');
end;

{The purpose of this function is to make a block of text into a well-formed
XML fragment by detecting and trying to fix improper nesting problems.
The idea is to parse through the string, tracking which tags are open.
When we encounter a close tag which doesn't match the last open tag, we
simply ignore it, leaving the tag open. Then, at the end, we close all
open tags. This doesn't necessarily create the RIGHT structure, but it
should create a well-formed structure, which may help a user to find and
correct errors.
}
function TufrmXMLUtilities.FixImproperlyNestedTags(XMLFragment: WideString): WideString;
var
i, j, k: integer;
ParseStack: TTntStringList;
TagStack: TTntStringList;
TagType: integer;

begin
  Result := '';
  ParseStack := TTntStringList.Create;
  try
//Split the whole into separate elements
    SplitXMLAtTagBoundaries(ParseStack, XMLFragment);
    if ParseStack.Count < 1 then
      Exit;

    TagStack := TTntStringList.Create;
    try
//Parse through the stack, flagging open tags
      for i := 0 to ParseStack.Count-1 do
        begin
          TagType := FindTagType(ParseStack[i]);
          Case TagType of
            XMLOpenTag:
              begin
                TagStack.Add(ParseStack[i]);
                Result := Result + ParseStack[i];
              end;
            XMLCloseTag:
              begin
//If any tags are open
                if TagStack.Count > 0 then
                  begin
//if the last opened one matches this closer
                    if CloseTagMatchesOpenTag(ParseStack[i], TagStack[TagStack.Count-1]) then
                      begin
//then it's OK -- pass through the closer, and remove the opener
                        Result := Result + ParseStack[i];
                        TagStack.Delete(TagStack.Count-1);
                      end;
                  end;
//Don't pass through the closer under any other circumstances
              end;
            XMLSelfClosingTag:
              begin
                Result := Result + ParseStack[i];
              end;
            NotXMLTag:
              begin
                Result := Result + EscapeEntities(ParseStack[i]);
              end;
          end; //end case
        end;
//Now if any tags are still open, we have to close them
      if TagStack.Count > 0 then
        for i := TagStack.Count-1 downto 0 do
          Result := Result + MakeCloseTagFromOpenTag(TagStack[i]);
    finally
      FreeAndNil(TagStack);
    end;
  finally
    FreeAndNil(ParseStack);
  end;
end;

function TufrmXMLUtilities.CreateXMLFileWrapper(FilePath, XMLFile,
  XSLFile: WideString): Boolean;
var
RelXMLFile, RelXSLFile: WideString;
DomImpl: TDomImplementation;
DomDoc: TDomDocument;
RootEl: TDomElement;
XSLProc: TDomProcessingInstruction;
DomToXMLParser: TDomToXMLParser;
FileStream: TFileStream;

begin
  Result := False; //default

  try
//Work out the relative path from the save location to the XML file
    RelXMLFile := Tnt_WideStringReplace(WideExtractRelativePath(FilePath, XMLFile),
                                        '\', '/', [rfReplaceAll], False);

//Do the same with the XSL file
    RelXSLFile := Tnt_WideStringReplace(WideExtractRelativePath(FilePath, XSLFile),
                                        '\', '/', [rfReplaceAll], False);

//Create a DOM implementation and document
    DomImpl := TDomImplementation.Create(Self);
    try
      DomDoc := TDomDocument.Create(DomImpl);

//Add the stylesheet processing instruction
      XSLProc := TDomProcessingInstruction.Create(DomDoc, 'xml-stylesheet');
      XSLProc.Data := 'type="text/xsl" href="' + RelXSLFile + '"';
      DomDoc.AppendChild(XSLProc);

//Create the root element
      RootEl := TDomElement.Create(DomDoc, 'docroot');

//Set its href attribute to point to the XML file
      RootEl.SetAttribute('href', RelXMLFile);

      DomDoc.AppendChild(RootEl);

//Create a DOMToXMLParser
      DomToXMLParser := TDomToXMLParser.Create(Self);
      try
        DomToXMLParser.DOMImpl := DomImpl;
        FileStream := TFileStream.Create(FilePath, fmCreate or fmShareExclusive);
        try
//Save the file
          DomToXMLParser.WriteToStream(DomDoc, 'UTF-8', FileStream);
        finally
          FreeAndNil(FileStream);
        end;
      finally
        FreeAndNil(DomToXMLParser);
      end;
//Return true
      Result := True;
    finally
      FreeAndNil(DomImpl);
    end;
  except
    Exit; //returning false
  end;
end;

function TufrmXMLUtilities.MakeXMLNCName(var wsSource: WideString): Boolean;
var
i: Integer;
wsOutput: WideString;

begin
  Result := False; //default
  if Length(wsSource) = 0 then
    Exit;

  wsOutput := '';

  if not (IsXmlLetter(PWideChar(wsSource)^)
          or (PWideChar(wsSource)^ = '_')) then
    wsOutput := '_';    //first position is special; make sure there's a valid char there.

  for i := 1 to Length(wsSource) do
    if IsXmlNCNameChar(wsSource[i]) then
      wsOutput := wsOutput + wsSource[i];
//If we've succeeded
  if IsXMLNCName(wsOutput) then
    begin
      wsSource := wsOutput;
      Result := True;
    end;
end;

function TufrmXMLUtilities.DoXSLTransformation(InXMLDoc,
  InXSLDoc: WideString): WideString;
var
Output: WideString;

begin
//ShowMessage(InXMLDoc + ' / ' + InXSLDoc);

  XMLDoc.LoadFromFile(InXMLDoc);
  XMLDoc.Active := True;
  WideShowMessage(XMLDoc.XML.Text);

  XSLDoc.LoadFromFile(InXSLDoc);
  XSLDoc.Active := True;
  WideShowMessage(XSLDoc.XML.Text);

  XHTMLDoc.Active := True;

  XMLDoc.DocumentElement.TransformNode(XSLDoc.DocumentElement, XHTMLDoc);

  XHTMLDoc.SaveToFile('c:\temp\test\testout.htm');
  Result := 'Done';
end;

function TufrmXMLUtilities.DoXSLTransformationFromFiles(InXMLDocFile,
  InXSLDocFile: WideString): WideString;
begin

end;

function TufrmXMLUtilities.IsWellFormedXMLFragment(InXML: WideString): Boolean;
begin
  Result := IsWellFormedXMLTree('<test>' + InXML + '</test>');
end;

function TufrmXMLUtilities.IsWellFormedXMLTree(InXML: WideString): Boolean;
var
DomImpl: TDomImplementation;
DomDoc: TDomDocument;
NewDomDoc: TDomDocument;
XMLToDomParser: TXMLToDomParser;
XMLInputSource: TXMLInputSource;
Frag: TDomNode;

begin
  Result := True;
  DomImpl := TDomImplementation.Create(nil);
  try
    DomDoc := TDomDocument.Create(DomImpl);
    try
      XMLToDomParser := TXMLToDomParser.Create(nil);
      try
        XMLInputSource := TXMLInputSource.Create(InXML, '', '', 1024, False,
                                                  0, 0, 0, 0, 1);
        try
          try
            XMLToDomParser.DOMImpl := DomDoc.DomImplementation;
  //          Frag := TDomNode.Create(DomDoc);
  //Update to XDOM 3.2 broke this line
  //          Frag := XMLTODomParser.ParseWideString(InXML, '', '', Frag);
  //TODO: THIS CODE IS BROKEN!!!!!!!!!!! It triggers False even when code is OK.
  //Need to use a TDomDocumentFragment, probably.
            {NewDomDoc := XMLToDomParser.WideStringToDom(
            '<?xml version="1.0"?>' + #13#10 + InXML, '', TUTF16LECodec, False);}
            NewDomDoc := XMLToDomParser.Parse(XMLInputSource);
            NewDomDoc.Free;
          except
            on E: EParserException do
              Result := False;
          end;
        finally
          FreeAndNil(XMLInputSource);
        end;
      finally
        FreeAndNil(XMLToDomParser);
      end;
    finally
      FreeAndNil(DomDoc);
    end;
  finally
    FreeAndNil(DomImpl);
  end;
end;

function TufrmXMLUtilities.AddIndentToNode(El: TDomElement;
  DomDoc: TDomDocument; NumSpaces: integer): Boolean;
var
TextNode: TDomText;
Indent: WideString;

begin
  Result := False;
  if El = nil then
    Exit;
  if DomDoc = nil then
    Exit;
  try
    TextNode := TDomText.Create(DomDoc);
    Indent := #10 + StringOfChar(WideChar(' '), NumSpaces);
    TextNode.NodeValue := Indent;
    El.AppendChild(TextNode);
    Result := True;
  except
//Returning false is OK.
  end;
end;

function TufrmXMLUtilities.ParseWideStringToDomNode(InXML: WideString;
                                      DomImpl: TDomImplementation;
                                      TargetDoc: TDomDocument): TDomNode;
var
XMLToDomParser: TXMLToDomParser;
XMLInputSource: TXMLInputSource;
DomDoc: TDomDocument;

begin
  XMLToDomParser := TXMLToDomParser.Create(nil);
  try
    XMLToDomParser.DOMImpl := DomImpl;
    DomDoc := TDomDocument.Create(DomImpl);
    try
      XMLInputSource := TXMLInputSource.Create(InXML, '', '', 4096, False,
                                                  0, 0, 0, 0, 1);
      try
        try
          DomDoc := XMLToDomParser.Parse(XMLInputSource);
          Result := TargetDoc.ImportNode(DomDoc.DocumentElement, True);
        except
          Result := nil;
        end;
      finally
        FreeAndNil(XMLInputSource);
      end;
    finally
      FreeAndNil(DomDoc);
    end;
  finally
    FreeAndNil(XMLToDomParser);
  end;
end;

end.
