Code/Resource
Windows Develop
Linux-Unix program
Internet-Socket-Network
Web Server
Browser Client
Ftp Server
Ftp Client
Browser Plugins
Proxy Server
Email Server
Email Client
WEB Mail
Firewall-Security
Telnet Server
Telnet Client
ICQ-IM-Chat
Search Engine
Sniffer Packet capture
Remote Control
xml-soap-webservice
P2P
WEB(ASP,PHP,...)
TCP/IP Stack
SNMP
Grid Computing
SilverLight
DNS
Cluster Service
Network Security
Communication-Mobile
Game Program
Editor
Multimedia program
Graph program
Compiler program
Compress-Decompress algorithms
Crypt_Decrypt algorithms
Mathematics-Numerical algorithms
MultiLanguage
Disk/Storage
Java Develop
assembly language
Applications
Other systems
Database system
Embedded-SCM Develop
FlashMX/Flex
source in ebook
Delphi VCL
OS Develop
MiddleWare
MPI
MacOS develop
LabView
ELanguage
Software/Tools
E-Books
Article/Document
HtmlParserGetPageUrls.java
Package: LuceneAndNuch_Ch1-10.rar [view]
Upload User: cctqzzy
Upload Date: 2022-03-14
Package Size: 12198k
Code Size: 2k
Category:
Search Engine
Development Platform:
Java
- package chapter9;
- import org.htmlparser.util.*;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.*;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.nodes.TextNode;
- import org.htmlparser.lexer.*;
- import org.htmlparser.lexer.Stream;
- import org.htmlparser.Node;
- import java.io.*;
- import java.net.*;
- import org.htmlparser.http.ConnectionManager;
- import org.htmlparser.visitors.TextExtractingVisitor;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.filters.HasSiblingFilter;
- import org.htmlparser.util.ParserException;
- public class HtmlParserGetPageUrls {
- public static void main (String[] args) throws ParserException
- {
- try {
- getHtmlUrls("http://www.bnu.edu.cn/","GB2312");
- } catch(ParserException e)
- {
- e.printStackTrace();
- }
- }
- public static void getHtmlUrls(String url , String pageEncoding) throws ParserException
- {
- NodeList nodeList = null;
- try {
- Parser parser = new Parser(url);
- parser.setEncoding(pageEncoding); // 设置解析编码格式
- // 可以使用下面filter来取出url连接
- //nodeList = parser.parse(new TagNameFilter("A")); // 使用TagNameFilter
- nodeList = parser.parse(new NodeClassFilter(LinkTag.class)); // 使用NodeClassFilter
- } catch (ParserException e) {
- e.printStackTrace();
- }
- if(nodeList != null && nodeList.size() > 0) { // 循环遍历每个Url节点
- for(int i = 0; i < nodeList.size(); i ++) {
- String urlLink = ((LinkTag)nodeList.elementAt(i)).extractLink();
- String LinkName = ((LinkTag)nodeList.elementAt(i)).getLinkText();
- if( urlLink.indexOf("bnu") == 0 || urlLink.indexOf("http") == 0 )
- //System.out.println(LinkName +" : "+ urlLink);
- System.out.println( urlLink );
- }
- }
- }
- }