heritrix爬取友人网（http://mobile.younet.com/）信息后遇到的问题

chx2176 2010-05-16

最近在使用heritrix爬取了http://mobile.younet.com/网站的网站产品页面后，在运行写入main函数的 Extractor后，控制台并没有出现所想要的信息，只有count输出为0 的信息，我由于初学实在是解决不出来，贴出我用的两个类Extractor和ExtractYounetMobile希望大家能帮我找找是什么原因了

package com.backSearch.extractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;

import com.backSearch.extractor.younet.ExtractYounetMobile;

public abstract class Extractor {

protected static final String NEWLINE = "\r\n";

/**
* 表示所有结果的输出路径
*/
private String outputPath = "";

/**
* 表示当前正在被处理的文件
*/
private String inputFilePath;

/**
* 表示当前所有被抓取的网页的镜象根目录在Heritrix用mirror目录表示
*/
private String mirrorDir = "";

/**
* 用于存放被处理过后的产口的图片的目录
*/
private String imageDir = "";

/**
* HTMLParser的实例
*/
private Parser parser;

/**
* 对图片路径进行哈希的算法，这里采用MD5算法
*/
protected static final String HASH_ALGORITHM = "md5";

/**
* 分隔符
*/
public static final String SEPARATOR = "======================";

/**
* 装载需要的网页文件
*
*/
public void loadFile(String path) {
try {
parser = new Parser(path);
inputFilePath = path;
parser.setEncoding("UTF-8");
} catch (Exception e) {
e.printStackTrace();
}
}

/**
* 获取输出的路径
*/
public String getOutputPath() {
return outputPath;
}

/**
* 设置输出的路径，通常在初始化Extractor时就应该做
*/
public void setOutputPath(String outputPath) {
this.outputPath = outputPath;
}

public Parser getParser() {
return parser;
}

/**
* 使用正则来匹配并获得网页中的字符串
*/
protected String getProp(String pattern, String match, int index) {
Pattern sp = Pattern.compile(pattern);
Matcher matcher = sp.matcher(match);
while (matcher.find()) {
return matcher.group(index);
}
return null;
}

/**
* 抽象方法，用于供子类实现。其功能主要是解释网页文件将产品信息保存到
*
*/
public abstract void extract();

/**
* 获取正在处理的文件的路径
*/
public String getInputFilePath() {
return inputFilePath;
}

/**
* 从mirror目录下拷贝文件至所设定的图片目录
* 该方法可能需要被改变
*/
protected boolean copyImage(String image_url, String new_image_file) {

String dirs = image_url.substring(7);

try {
// instance the File as file_in and file_out
File file_in = new File(new File(mirrorDir), dirs);
if (file_in == null || !file_in.exists()) {
file_in = new File("f:\\sousuo\\noimage.jpg");
}

File file_out = new File(new File(imageDir), new_image_file);

FileInputStream in1 = new FileInputStream(file_in);
FileOutputStream out1 = new FileOutputStream(file_out);

byte[] bytes = new byte[1024];
int c;
while ((c = in1.read(bytes)) != -1)
out1.write(bytes, 0, c);

// close
in1.close();
out1.close();
return (true); // if success then return true
} catch (Exception e) {
e.printStackTrace();
return (false); // if fail then return false
}
}

public String getImageDir() {
return imageDir;
}

public void setImageDir(String imageDir) {
this.imageDir = imageDir;
}

public String getMirrorDir() {
return mirrorDir;
}

public void setMirrorDir(String mirrorDir) {
this.mirrorDir = mirrorDir;
}

public void setInputFilePath(String inputFilePath) {
this.inputFilePath = inputFilePath;
}

// public static void main(String[] args) throws Exception {
//
// Extractor extractor = new Extract163Moblie();
// extractor.setOutputPath("c:\\product\\test\\mobile\\");
// extractor.setImageDir("c:\\product\\test\\image\\");
// extractor.setMirrorDir("F:\\data\\163手机\\mirror\\");
//
// traverse(extractor, new File("F:\\data\\163手机\\mirror\\mobile.163.com\\0011\\product\\0011000B\\product"));
// System.out.println(count);
//
// }
static int count = 0;

public static void main(String[] args) throws Exception {

Extractor extractor = new ExtractYounetMobile();
extractor.setOutputPath("F:\\product\\mobile\\");
extractor.setImageDir("F:\\product\\image\\");
extractor.setMirrorDir("F:\\learn\\Workspaces\\MyEclipse 7.0\\heritrixProject_1\\jobs\\YounetMobile-20100514064948846\\mirror\\");

//try {
//long s = System.currentTimeMillis();
traverse(extractor, new File("F:\\learn\\Workspaces\\MyEclipse 7.0\\heritrixProject_1\\jobs\\YounetMobile-20100514064948846\\mirror\\mobile.younet.com\\files\\"));
//long e = System.currentTimeMillis();
//System.out.println("1---------------------" + e);
//System.out.println("2---------------------" + s);
//System.out.println("用时： " + (e - s) / 1000 + " 秒");
System.out.println("总数" + count);
// } catch (Exception e) {
// e.printStackTrace();
//}

}

public static void traverse(Extractor extractor, File path)
throws Exception {
if (path == null) {
return;
}

if (path.isDirectory()) {
String[] files = path.list();
for (int i = 0; i < files.length; i++) {
traverse(extractor, new File(path, files[i]));
}
} else {
if (path.getAbsolutePath().endsWith(".html")
&& path.getAbsolutePath().indexOf("_") == -1) {
System.out.println(path);
count++;
extractor.loadFile(path.getAbsolutePath());
extractor.extract();
}
}
}

}

package com.backSearch.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.backSearch.extractor.Extractor;
import com.backSearch.util.StringUtils;

public class ExtractYounetMobile extends Extractor {

@Override
public void extract() {
BufferedWriter bw = null;
NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit"));
NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"))));
NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));

//提取标题信息
try {
//Parser根据过滤器返回所有满足过滤条件的节点
// 迭代逐渐查找
    NodeList nodeList=this.getParser().parse(title_filter);
NodeIterator it = nodeList.elements();
StringBuffer title = new StringBuffer();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();
String[] names = node.toPlainTextString().split(" ");
for(int i = 0; i < names.length; i++)
title.append(names[i]).append("-");
title.append(new Date().getTime());
//创建要生成的文件
bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
//获取当前提取页的完整URL地址
int startPos = this.getInputFilePath().indexOf("mirror") + 6;
String url_seg = this.getInputFilePath().substring(startPos);
url_seg = url_seg.replaceAll("\\\\", "/");
String url = "http:/" + url_seg;
//写入当前提取页的完整URL地址
bw.write(url + NEWLINE);
bw.write(names[0] + NEWLINE);
bw.write(names[1] + NEWLINE);

}
// 重置Parser
this.getParser().reset();
Parser attNameParser = null;
Parser attValueParser = null;
            //Parser parser=new Parser("http://www.sina.com.cn");
NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"));
    NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2"));
            String attName = "";
            String attValue = "";
            // 迭代逐渐查找
    nodeList=this.getParser().parse(attribute_filter);
it = nodeList.elements();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();
attNameParser = new Parser();
attNameParser.setEncoding("UTF-8");
attNameParser.setInputHTML(node.toHtml());
NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
attName = attNameNodeList.elements().nextNode().toPlainTextString();

attValueParser = new Parser();
attValueParser.setEncoding("UTF-8");
attValueParser.setInputHTML(node.toHtml());
NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
attValue = attValueNodeList.elements().nextNode().toPlainTextString();
bw.write(attName.trim() + attValue.trim());
bw.newLine();
}
// 重置Parser
this.getParser().reset();
String imgUrl = "";
String fileType ="";
// 迭代逐渐查找
    nodeList=this.getParser().parse(img_filter);
it = nodeList.elements();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();

ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode();
imgUrl = imgNode.getAttribute("src");
fileType = imgUrl.trim().substring(imgUrl
.lastIndexOf(".") + 1);
//生成新的图片的文件名
String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
//利用miorr目录下的图片生成的新的图片
this.copyImage(imgUrl, new_iamge_file);
bw.write(SEPARATOR + NEWLINE);
bw.write(new_iamge_file + NEWLINE);
}


        } catch(Exception e) {
            e.printStackTrace();
        } finally {
        try{
    if (bw != null)
    bw.close();
    }catch(IOException e){
    e.printStackTrace();
    }
        }

}
}

我是在heritrix里面写了一个MobileYounetExtractor 正则表达式选定
“http://mobile.younet.com/choose.php?groupid=1,2,3,4,&tradeid=[\\d]+,& ”
来抓取该网站下的各种手机型号的页面和相关图片。
希望大家能给我点儿帮助，支持邮箱及QQ联系。谢谢大家

chx2176 2010-05-16

果然是路径问题，我把路径F:\learn\Workspaces\MyEclipse 7.0 \heritrixProject_1\jobs\下的YounetMobile-20100514064948846移到F盘根目录下，控制台正常的打印出了各手机品牌页面。不过在我前面制定的F：\product xia de mobile 和image下什么都没有。。。明天研究。。。要熄灯了

发表回复

>>返回群组首页

heritrix爬取友人网（http://mobile.younet.com/）信息后遇到的问题

相关讨论

相关资源推荐