網頁正文提取演算法

在 Styletrip 專案內，我們會抓取所有景點資料的相關部落格，然後利用部落格的內容做進一步的分析，所以我們需要有能力從抓到的部落格網頁中取出正文內容。

網頁正文，就是我們平常所看部落格文章最主要的內容區塊，而網頁正文提取的演算法有基於不同的方法來做提取，各方法的準確度都不同，也會因為網頁結構不同而有不一樣的結果。

這邊我是採用 VIPS（Vision-based Page Segmentation Algorithm），這個演算法是基於視覺結構來做判斷，也就是可以想成網頁正文通常都是那個網頁裡面看起來最大的區域。

這邊提供最核心、主要的實作方式：

	public static BlockProperties parseWebContent(Parser parser) throws ParserException {
	NodeList visualBlockNodeList = getVisualBlock(parser);
	// Output.printNodeList( visualBlockNodeList );
	NodeList linkNodeList = findLinkBlock(visualBlockNodeList);
	NodeList invalidNodeList = findInvalidBlock(visualBlockNodeList);
	NodeList actionNodeList = findActionBlock(visualBlockNodeList); // NOTE: 因為動作標籤可能沒有包含文字，所以要獨立出來找

	Map<String, NodeList> blockNodeMap = new HashMap<String, NodeList>();
	blockNodeMap.put(VISUAL_BLOCK, visualBlockNodeList);
	blockNodeMap.put(LINK_BLOCK, linkNodeList);
	blockNodeMap.put(INVALID_BLOCK, invalidNodeList);
	blockNodeMap.put(ACTION_BLOCK, actionNodeList);

	List<BlockProperties> blockPropertiesList = getBlockProperties(blockNodeMap);
	Map<BlockProperties, Double> propMap = new TreeMap<BlockProperties, Double>();
	for (int i = 0; i < blockPropertiesList.size(); i++) {
	BlockProperties blockProp = blockPropertiesList.get(i);

	// /*
	if (blockProp.getProperties().equals(NORMAL_BLOCK) && blockProp.getBlockText().length() <= 0)
	continue;
	// v1: 0.3
	else if ((blockProp.getProperties().equals(LINK_BLOCK) && blockProp.getBlockTextRatio() >= 0.4) \|\|
	blockProp.getSubLinkTextRatio() >= 0.45)
	continue;
	// v1: 0.4
	else if ((blockProp.getProperties().equals(INVALID_BLOCK) && blockProp.getBlockTextRatio() >= 0.65) \|\|
	blockProp.getSubInvalidTextRatio() >= 0.4)
	continue;
	else if (blockProp.getProperties().equals(ACTION_BLOCK) \|\| blockProp.getSubActionBlock() > 0)
	continue;
	// */
	// blockProp.print();

	// 找出正文區塊的計算特徵值公式
	double linkInvalidTextLen = blockProp.getSubInvalidTextLength() + blockProp.getSubLinkTextLength();
	double normalTextLen = blockProp.getBlockText().length() * (1.0 - blockProp.getBlockTextRatio()) - linkInvalidTextLen;
	if (linkInvalidTextLen <= 0)
	linkInvalidTextLen = 1.0; // 為了除法，如果等於零要轉成1

	double linkInvalidSubBlockNum = blockProp.getSubLinkBlock() + blockProp.getSubInvalidBlock();
	double normalSubBlock = (double) (blockProp.getSubBlockNum() - linkInvalidSubBlockNum);
	if (linkInvalidSubBlockNum <= 0)
	linkInvalidSubBlockNum = 1.0;

	double weight = Math.pow(normalTextLen, 5) / (double) blockProp.getBlockText().length();
	weight /= Math.pow(10.0, 5);

	// 子區块都是連結或是無效區块
	if (normalSubBlock == 0 && blockProp.getSubBlockNum() != 0)
	weight /= (10.0 * Math.pow(blockProp.getSubBlockNum(), 2));
	else if (normalSubBlock != 0 && blockProp.getSubBlockNum() != 0)
	weight *= (normalSubBlock / Math.pow(blockProp.getSubBlockNum(), 2));

	if (blockProp.getProperties().equals(NORMAL_BLOCK))
	weight *= 3.0;
	else if (blockProp.getProperties().equals(INVALID_BLOCK))
	weight *= 1.2;
	else if (blockProp.getProperties().equals(LINK_BLOCK))
	weight *= 1.8;

	// 由視覺區块的class或id來判斷，包含article和content的字眼可提高權重值
	CompositeTag blockTag = (CompositeTag) blockProp.getBlockNode();

	String className = blockTag.getAttribute("class");
	String idName = blockTag.getAttribute("id");
	String checkName = null;
	if (idName != null)
	checkName = idName.toLowerCase();
	else if (className != null)
	checkName = className.toLowerCase();

	if (checkName != null) {
	checkName = checkName.toLowerCase().trim();
	if (!(checkName.contains("footer") \|\|
	checkName.contains("header") \|\|
	checkName.contains("counter") \|\|
	checkName.contains("banner")) \|\|
	checkName.contains("widget")) {
	// System.out.println( "*Weight=" + weight );
	if ((checkName.contains("body") &&
	checkName.contains("post")) \|\|
	(checkName.contains("entry") &&
	checkName.contains("content")) \|\|
	checkName.contains("innertext") \|\|
	(checkName.contains("content") &&
	checkName.contains("article")))
	weight *= 1000.0;
	else if (checkName.contains("content"))
	weight *= 50.0;
	else if (checkName.contains("article"))
	weight *= 10.0;
	else if (checkName.contains("text"))
	weight *= 5.0;

	// 有id的再加分
	if (idName != null)
	weight *= 100.0;
	// System.out.println( "Weight'=" + weight );

	}
	}

	propMap.put(blockProp, weight);
	// System.out.println( "\t*Weight=" + weight );
	propMap = MapUtils.sortByValue(propMap, true);
	}

	int count = 0; // 為了取得第一個BlockProperties用的
	BlockProperties contentProp = null;
	int commentIndex = 0; // 用來儲存回應的區塊索引，所有在回應以下的區塊都不能成為正文區块
	for (BlockProperties prop : propMap.keySet()) {
	if (prop.getBlockText().length() <= 0)
	continue;
	try {
	// 用來去掉回應區塊
	CompositeTag propNode = (CompositeTag) prop.getBlockNode();
	String className = propNode.getAttribute("class");
	String idName = propNode.getAttribute("id");
	String checkName = null;
	if (idName != null)
	checkName = idName;
	else if (className != null)
	checkName = className;

	if (checkName != null) {
	if (checkName.contains("comment") \|\| checkName.contains("reply")) {
	commentIndex = blockPropertiesList.indexOf(prop);
	continue;
	} else if (containTrimClassID(propNode))
	continue;
	}
	}
	catch (NullPointerException e) {
	e.printStackTrace();
	}

	if (commentIndex != 0 && blockPropertiesList.indexOf(prop) > commentIndex)
	continue;

	if (count == 0) {
	// System.out.println( "\n\n--> Wegith= " + propMap.get( prop ) ); prop.print();
	contentProp = prop;
	count++;
	} else
	break;
	}

	// 找到正文區塊還有其子區块，判斷是否包含連結區塊或是特定class, id名稱，然後過濾掉。
	Node currentNode = contentProp.getBlockNode();
	String contentHtml =
	org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toHtml(), "\\s+", " ").toLowerCase().trim();

	// currentNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim();
	String contentText = WebCrawler.filterSpecialSymbol(
	org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim());
	// currentNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() );
	// System.out.println( contentText.length() );
	String checkContentHtml = contentHtml; // check開頭的變數是給迴圈判斷正文區塊的子區塊用的
	String checkContentText = contentText; // 因為在迴圈中contentHtml和contentText的字串會變動，所以無法拿來判斷子區塊
	Map<String, Integer> trimTextMap = new TreeMap<String, Integer>();
	for (int i = visualBlockNodeList.indexOf(contentProp.getBlockNode()) + 1;
	i < visualBlockNodeList.size(); i++
	) {
	CompositeTag nextNode = (CompositeTag) visualBlockNodeList.elementAt(i);
	String nextHtml =
	org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toHtml(), "\\s+", " ").toLowerCase().trim();
	// nextNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim();
	String nextText = WebCrawler.filterSpecialSymbol(
	org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim());
	// nextNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ).trim();

	if (checkContentHtml.contains(nextHtml) &&
	checkContentText.contains(nextText)) {
	// System.out.println( "*" + nextHtml );
	// System.out.println( "\t" + nextText );
	// System.out.println( nextText.length() );
	if (containTrimClassID(nextNode)) {
	// 預防過濾掉整個正文字串
	if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) {
	// System.out.printf("\t過濾掉:%s\n", nextText );
	trimTextMap.put(nextHtml, nextText.length());
	}
	}

	if (linkNodeList.contains(nextNode)) {
	int linkTextLen = getLinkTextLength(nextNode);
	double linkTextRatio = (double) linkTextLen / (double) nextText.length();
	if (linkTextRatio >= 0.65) {
	if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) {
	trimTextMap.put(nextHtml, nextText.length());
	// System.out.println( "\t連結區: " + linkTextRatio );
	}
	}
	}

	} else
	break;
	}

	/**
	* 不在上面直接過濾而且是由字串長的過濾到短的是因為
	* 如果要過濾的字串是很短的話，過濾掉後會影響到長字串的過濾
	* ex: ｢我今天去了淡水，還有去淡水老街。｣
	* 先過濾｢淡水｣ ==> ｢我今天去了，還有去老街。｣
	* 再過濾｢淡水老街｣ ==> 無法過濾掉｢淡水老街｣，因為字串剩下｢老街｣
	* 過濾不完全!!
	* 但如果先過濾掉｢淡水老街｣ ==> ｢我今天去了淡水，還有去。｣
	* 在過濾掉｢淡水｣ ==> ｢我今天去了，還有去。｣
	* 完全過濾!!
	*/
	trimTextMap = MapUtils.sortByValue(trimTextMap, true);
	for (String html : trimTextMap.keySet()) {
	// System.out.println( "Filter=" + html );
	contentHtml = contentHtml.replace(html, "");
	}
	contentProp.setBlockHtml(
	org.apache.commons.lang3.StringUtils.replacePattern(
	org.apache.commons.lang3.StringUtils.replacePattern(contentHtml, HtmlTag.BR_REPLACE_REGEX, " "), HtmlTag.STYLE_REPLACE_REGEX, ""));
	return contentProp;
	}

view raw WebContentParser.java hosted with ❤ by GitHub

Demo

擷取選取區域_00432343

	Glynis Dole on Make an Android custom view an…
	Engine Bai / 白昌永（大白） on Android App 開發實戰系列 Part 5. Epo…
	Jaclyn on Make an Android custom view an…
	Jenny on Advanced web scraping in …
	Engine Bai / 白昌永（大白） on Android App 開發實戰系列 Part 5. Epo…

網頁正文提取演算法

Demo

Leave a comment Cancel reply

Follow Me

Recent Posts

Tags