我正在尝试获取HTML meta标签中的charset属性(例如 <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">)。
有没有办法在Linux下用C/C++做到这一点?我使用HTML Tidy作为解析器,但无论文档的实际编码是什么(即使是utf-8),得到的charset始终是us-ascii。
这是我得到的输出:
*.*4 Node: meta
Name attr: http-equiv
Value attr:Content-Type
Name attr: content
Value attr:text/html; charset=US-ASCII
1 回答
根据Vinko Vrsalovic的要求,以下是获得该结果的代码:
/**
 * Recursively prints every descendant of `tnod`.
 *
 * For each child node a display name is chosen from its node type; element
 * nodes (start / end / start-end tags and anything not listed explicitly)
 * use the tag name and additionally have each attribute's name and value
 * written to std::cout. One line per node is then written with printf, and
 * the child's own subtree is dumped with the indent increased by 4.
 *
 * @param tnod   Node whose children are dumped; `tnod` itself is not printed.
 * @param indent Indentation value passed to the per-node printf line.
 */
void dumpNode(TidyNode tnod, int indent)
{
    for (TidyNode child = tidyGetChild(tnod); child; child = tidyGetNext(child))
    {
        ctmbstr name = NULL;
        switch (tidyNodeGetType(child))
        {
        case TidyNode_Root:     name = "Root"; break;
        case TidyNode_DocType:  name = "DOCTYPE"; break;
        case TidyNode_Comment:  name = "Comment"; break;
        case TidyNode_ProcIns:  name = "Processing Instruction"; break;
        case TidyNode_Text:     name = "Text"; break;
        case TidyNode_CDATA:    name = "CDATA"; break;
        case TidyNode_Section:  name = "XML Section"; break;
        case TidyNode_Asp:      name = "ASP"; break;
        case TidyNode_Jste:     name = "JSTE"; break;
        case TidyNode_Php:      name = "PHP"; break;
        case TidyNode_XmlDecl:  name = "XML Declaration"; break;
        case TidyNode_Start:
        case TidyNode_End:
        case TidyNode_StartEnd:
        default:
        {
            name = tidyNodeGetName(child);
            for (TidyAttr att = tidyAttrFirst(child); att; att = tidyAttrNext(att))
            {
                // The name/value accessors may return NULL (e.g. an attribute
                // written without a value); streaming a null char pointer to
                // std::cout is undefined behaviour, so print it as empty.
                ctmbstr attrName = tidyAttrName(att);
                ctmbstr attrValue = tidyAttrValue(att);
                std::cout << "Name attr: " << (attrName ? attrName : "") << std::endl;
                std::cout << "Value attr:" << (attrValue ? attrValue : "") << std::endl;
            }
            break;
        }
        }
        assert(name != NULL);
        printf("%d*.*%d%sNode: %s\n", indent, indent, " ", name);
        dumpNode(child, indent + 4);
    }
}
/**
 * Dumps the node tree of a parsed document.
 *
 * Fetches the node returned by tidyGetHtml() for `tdoc` and prints all of its
 * descendants via dumpNode(), starting with an indent of 0.
 *
 * @param tdoc Tidy document to dump.
 */
void dumpHtml(TidyDoc tdoc)
{
    TidyNode htmlNode = tidyGetHtml(tdoc);
    dumpNode(htmlNode, 0);
}
int main(int argc,char ** argv){
std :: string toReturn(“”);
TidyBuffer输出;
TidyBuffer errbuf;
int rc = -1;
Bool好的;
tidyBufInit(输出);
tidyBufInit(&errbuf);
TidyDoc tdoc = tidyCreate();
ok = tidyOptSetBool(tdoc,TidyXhtmlOut,yes); //转换为XHTML
如果(好的)
rc = tidySetErrorBuffer(tdoc,&errbuf); //捕获诊断
if(rc> = 0)
rc = tidyParseFile(tdoc,"fuebuena.html"); //解析输入
if(rc> = 0)
rc = tidyCleanAndRepair(tdoc); //收拾它!
if(rc> = 0)
dumpHtml(tdoc);
返回0;
}