C# XPath解析HtmlDocument的使用方法与递归取得页面所有标签XPath值(附源码)
// HtmlDocument is the HtmlAgilityPack object used to work with an HTML document.
HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
// Parse the HTML text held in strhtml.
hd.LoadHtml(strhtml);
// SelectSingleNode returns null when nothing matches the XPath, so check the
// result before reading OuterHtml; str stays empty when the element is missing.
HtmlAgilityPack.HtmlNode node = hd.DocumentNode.SelectSingleNode("//*[@id='e_font']");
string str = node != null ? node.OuterHtml : string.Empty;
这样就可以得到一个标签的Html代码了。
OuterHtml取的是包含标签本身在内的Html;InnerHtml取的是包含在这个标签之内的所有Html代码。
这点大家要注意。
如果大家想获取某段Html代码的XPath路径,就是下面这部分:
1. //*[@id='e_font']
复制代码
这个其实很简单,只要大家安装一个Firebug就行了,
看下图片:
<ignore_js_op>
大家只要进入选择模式,选中你要的内容,然后右键复制一下就行了。
然后放在SelectSingleNode()方法里就OK了。
下面我说说几个方法和属性的意思吧。
⽅法
SelectNodes 获取的是⼀个集合
SelectSingleNode 获取⼀个标签
SetAttributeValue 设置标签的属性值例如:SetAttributeValue("name","xpath-89");这说明把name属性的值修改为xpath-89
属性
OuterHtml 是取包含本⾝的Html
InnerHtml 取的包含在这个标签之内的所有Html代码了
XPath 获取相对应的Xpath值
Attributes 获取标签的属性集合,可以按名称取某一个属性,例如:Attributes["name"]
也可以进⾏添加属性例如:
普通浏览 复制代码
1. hd.DocumentNode.SelectSingleNode (item.Key ).Attributes.Add ( "xpathid", "xpath_1" ) ;
下面我写了一个递归获取Html页面所有XPath值的方法,大家看一下吧:
普通浏览 复制代码
1. //key(Xpath),value(整个节点)
2. public List<ObjXpath> XpathList = new List<ObjXpath> ( ) ;
3. public string strhtml = "" ; //这里就是你的Html代码,具体怎么获取请参考我的HttpHelper类(www.sufeinet/thread-3-1-1.html)
4. private int Index = 0 ;
/// <summary>
/// Parses <c>strhtml</c> and rebuilds <c>XpathList</c> from scratch.
/// </summary>
/// <remarks>
/// The walk starts at the document node itself, so root-level elements
/// (normally the html element) are recorded as well as their descendants.
/// </remarks>
private void SartNode()
{
    HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
    hd.LoadHtml(strhtml);

    // Reset the state left behind by any previous run.
    Index = 0;
    XpathList.Clear();

    // Setxpath records the children of the node it is given, not the node
    // itself. Passing each top-level node (as before) therefore left those
    // top-level nodes out of the list; passing the document node covers them.
    Setxpath(hd.DocumentNode);
}
/// <summary>
/// Walks the children of <paramref name="node"/> depth-first and appends one
/// <c>ObjXpath</c> entry per visited child to <c>XpathList</c>.
/// </summary>
/// <param name="node">Node whose descendants are recorded; the node itself is not added.</param>
private void Setxpath(HtmlNode node)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        string path = child.XPath;

        // Any child whose path contains '#' is skipped together with everything
        // below it (presumably HtmlAgilityPack's #text / #comment nodes - confirm).
        if (path.Contains("#"))
        {
            continue;
        }

        // Ids follow visiting order: a parent receives its id before its children.
        XpathList.Add(new ObjXpath() { id = Index.ToString(), Key = path, Value = "" });
        Index++;

        if (child.ChildNodes.Count > 0)
        {
            Setxpath(child);
        }
    }
}
/// <summary>
/// One entry of the XPath listing: three plain string properties.
/// </summary>
public class ObjXpath
{
    /// <summary>Identifier of the entry, kept as text.</summary>
    public string id { get; set; }

    /// <summary>Key of the entry; the listing code stores a node's XPath here.</summary>
    public string Key { get; set; }

    /// <summary>Value paired with the key; the listing code stores an empty string.</summary>
    public string Value { get; set; }
}
XpathList 就是获取到的所有XPath值了,大家有兴趣的话可以试试。
我们先来看看效果吧:
<ignore_js_op>
好了,下面放出所有代码给大家:
普通浏览 复制代码
1. using System ;
2. using System. Collections. Generic ;
3. using System. ComponentModel ;
4. using System. Data ;
5. using System. Drawing ;
6. using System.Linq ;
7. using System. Text ;
8. using System.Windows.Forms ;
9. using System. Text. RegularExpressions ;
10. using System. Threading ;
11. using HtmlAgilityPack ;
12. using System. IO ;
13. using System. Runtime. Serialization.Json ;
14.
15. namespace AutoXpathTools
16. {
17. public partial class Form1 : Form
18. {
// Constructs the form. The only work done here is the call to
// InitializeComponent(), which is not shown in this listing
// (presumably the designer-generated half of this partial class - confirm).
19. public Form1 ( )
20. {
21. InitializeComponent ( ) ;
22. }
23.
24. #region 私有变量和⽅法
25.
26. //委托传入一个字符串
27. private delegate void SetListBox ( string str ) ;
28.
29. //key(Xpath),value(整个节点)
30. List<ObjXpath> XpathList = new List<ObjXpath> ( ) ;
31. private int Index = 0 ;
32. //htmlDcoument对象⽤来访问Html⽂档
33. HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument ( ) ;
34.
35. #endregion
36.
/// <summary>
/// Downloads the page whose URL is typed in <c>textBox1</c>, shows the HTML in
/// <c>txtXml</c>, and starts a worker thread that runs <c>SartNode</c> on it.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnGetXpath_Click(object sender, EventArgs e)
{
    try
    {
        // Read the controls once, here on the thread that raised the click.
        string url = textBox1.Text.Trim();

        HttpHelper http = new HttpHelper();
        HttpItem item = new HttpItem() { URL = url, IsToLower = false, Encoding = "gbk" };
        string html = http.GetHtml(item);
        txtXml.Text = html;

        // An empty result or the literal text "error" (any casing) is treated as failure.
        bool hasContent = !string.IsNullOrWhiteSpace(html)
            && !string.Equals(html.Trim(), "error", StringComparison.OrdinalIgnoreCase);
        if (!hasContent)
        {
            txtXml.Text = "根据您的的ULR:" + url + "⽆法得到任何内容";
            return;
        }

        // Keep the form-level document in sync with what is displayed.
        hd.LoadHtml(html);

        // The worker receives the HTML through the captured local. Reading
        // txtXml.Text inside the delegate, as before, touched a WinForms
        // control from a thread other than the one that owns it.
        Thread pingTask = new Thread(new ThreadStart(delegate
        {
            SartNode(html);
        }));
        pingTask.Start();
    }
    catch (Exception ex)
    {
        // Failures raised on this thread are shown in the output box.
        txtXml.Text = ex.Message.Trim();
    }
}
69.
70.
/// <summary>
/// Parses <paramref name="strhtml"/> and rebuilds <c>XpathList</c> from scratch.
/// </summary>
/// <param name="strhtml">HTML text to analyse.</param>
/// <remarks>
/// The walk starts at the document node itself, so root-level elements
/// (normally the html element) are recorded as well as their descendants.
/// </remarks>
private void SartNode(string strhtml)
{
    // A local document is parsed here; the form's hd field is not modified.
    HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
    hd.LoadHtml(strhtml);

    // Reset the state left behind by any previous run.
    Index = 0;
    XpathList.Clear();

    // Setxpath iterates the ChildNodes of the node it is given. Passing each
    // top-level node (as before) therefore left those top-level nodes out of
    // the list; passing the document node covers them.
    Setxpath(hd.DocumentNode);
}
86. /// <summary>
87. /// 递归获取Html Dom
88. /// </summary>
89. /// <param name="node">要处理的节点</param>
90. private void Setxpath (HtmlNode node )
91. {
92. foreach (HtmlNode item in node.ChildNodes )
93. {
94. if (item. XPath.Contains ( "#" ) )
95. {
96. continue ;
97. }
发布评论