在 Styletrip 專案內,我們會抓取所有景點資料的相關部落格,然後利用部落格的內容做進一步的分析,所以我們需要有能力從抓到的部落格網頁中取出正文內容。

網頁正文,就是我們平常所看部落格文章最主要的內容區塊,而網頁正文提取的演算法有基於不同的方法來做提取,各方法的準確度都不同,也會因為網頁結構不同而有不一樣的結果。
這邊我是採用 VIPS: Vision based Page Segmentation Algorithm, 這個演算法是基於視覺結構來做判斷,也就是可以想成網頁正文通常都是那個網頁裡面看起來最大的區域。
這邊提供最核心、主要的實作方式:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| public static BlockProperties parseWebContent(Parser parser) throws ParserException { | |
| NodeList visualBlockNodeList = getVisualBlock(parser); | |
| // Output.printNodeList( visualBlockNodeList ); | |
| NodeList linkNodeList = findLinkBlock(visualBlockNodeList); | |
| NodeList invalidNodeList = findInvalidBlock(visualBlockNodeList); | |
| NodeList actionNodeList = findActionBlock(visualBlockNodeList); // NOTE: 因為動作標籤可能沒有包含文字,所以要獨立出來找 | |
| Map<String, NodeList> blockNodeMap = new HashMap<String, NodeList>(); | |
| blockNodeMap.put(VISUAL_BLOCK, visualBlockNodeList); | |
| blockNodeMap.put(LINK_BLOCK, linkNodeList); | |
| blockNodeMap.put(INVALID_BLOCK, invalidNodeList); | |
| blockNodeMap.put(ACTION_BLOCK, actionNodeList); | |
| List<BlockProperties> blockPropertiesList = getBlockProperties(blockNodeMap); | |
| Map<BlockProperties, Double> propMap = new TreeMap<BlockProperties, Double>(); | |
| for (int i = 0; i < blockPropertiesList.size(); i++) { | |
| BlockProperties blockProp = blockPropertiesList.get(i); | |
| // /* | |
| if (blockProp.getProperties().equals(NORMAL_BLOCK) && blockProp.getBlockText().length() <= 0) | |
| continue; | |
| // v1: 0.3 | |
| else if ((blockProp.getProperties().equals(LINK_BLOCK) && blockProp.getBlockTextRatio() >= 0.4) || | |
| blockProp.getSubLinkTextRatio() >= 0.45) | |
| continue; | |
| // v1: 0.4 | |
| else if ((blockProp.getProperties().equals(INVALID_BLOCK) && blockProp.getBlockTextRatio() >= 0.65) || | |
| blockProp.getSubInvalidTextRatio() >= 0.4) | |
| continue; | |
| else if (blockProp.getProperties().equals(ACTION_BLOCK) || blockProp.getSubActionBlock() > 0) | |
| continue; | |
| // */ | |
| // blockProp.print(); | |
| // 找出正文區塊的計算特徵值公式 | |
| double linkInvalidTextLen = blockProp.getSubInvalidTextLength() + blockProp.getSubLinkTextLength(); | |
| double normalTextLen = blockProp.getBlockText().length() * (1.0 - blockProp.getBlockTextRatio()) - linkInvalidTextLen; | |
| if (linkInvalidTextLen <= 0) | |
| linkInvalidTextLen = 1.0; // 為了除法,如果等於零要轉成1 | |
| double linkInvalidSubBlockNum = blockProp.getSubLinkBlock() + blockProp.getSubInvalidBlock(); | |
| double normalSubBlock = (double) (blockProp.getSubBlockNum() - linkInvalidSubBlockNum); | |
| if (linkInvalidSubBlockNum <= 0) | |
| linkInvalidSubBlockNum = 1.0; | |
| double weight = Math.pow(normalTextLen, 5) / (double) blockProp.getBlockText().length(); | |
| weight /= Math.pow(10.0, 5); | |
| // 子區块都是連結或是無效區块 | |
| if (normalSubBlock == 0 && blockProp.getSubBlockNum() != 0) | |
| weight /= (10.0 * Math.pow(blockProp.getSubBlockNum(), 2)); | |
| else if (normalSubBlock != 0 && blockProp.getSubBlockNum() != 0) | |
| weight *= (normalSubBlock / Math.pow(blockProp.getSubBlockNum(), 2)); | |
| if (blockProp.getProperties().equals(NORMAL_BLOCK)) | |
| weight *= 3.0; | |
| else if (blockProp.getProperties().equals(INVALID_BLOCK)) | |
| weight *= 1.2; | |
| else if (blockProp.getProperties().equals(LINK_BLOCK)) | |
| weight *= 1.8; | |
| // 由視覺區块的class或id來判斷,包含article和content的字眼可提高權重值 | |
| CompositeTag blockTag = (CompositeTag) blockProp.getBlockNode(); | |
| String className = blockTag.getAttribute("class"); | |
| String idName = blockTag.getAttribute("id"); | |
| String checkName = null; | |
| if (idName != null) | |
| checkName = idName.toLowerCase(); | |
| else if (className != null) | |
| checkName = className.toLowerCase(); | |
| if (checkName != null) { | |
| checkName = checkName.toLowerCase().trim(); | |
| if (!(checkName.contains("footer") || | |
| checkName.contains("header") || | |
| checkName.contains("counter") || | |
| checkName.contains("banner")) || | |
| checkName.contains("widget")) { | |
| // System.out.println( "*Weight=" + weight ); | |
| if ((checkName.contains("body") && | |
| checkName.contains("post")) || | |
| (checkName.contains("entry") && | |
| checkName.contains("content")) || | |
| checkName.contains("innertext") || | |
| (checkName.contains("content") && | |
| checkName.contains("article"))) | |
| weight *= 1000.0; | |
| else if (checkName.contains("content")) | |
| weight *= 50.0; | |
| else if (checkName.contains("article")) | |
| weight *= 10.0; | |
| else if (checkName.contains("text")) | |
| weight *= 5.0; | |
| // 有id的再加分 | |
| if (idName != null) | |
| weight *= 100.0; | |
| // System.out.println( "Weight'=" + weight ); | |
| } | |
| } | |
| propMap.put(blockProp, weight); | |
| // System.out.println( "\t*Weight=" + weight ); | |
| propMap = MapUtils.sortByValue(propMap, true); | |
| } | |
| int count = 0; // 為了取得第一個BlockProperties用的 | |
| BlockProperties contentProp = null; | |
| int commentIndex = 0; // 用來儲存回應的區塊索引,所有在回應以下的區塊都不能成為正文區块 | |
| for (BlockProperties prop : propMap.keySet()) { | |
| if (prop.getBlockText().length() <= 0) | |
| continue; | |
| try { | |
| // 用來去掉回應區塊 | |
| CompositeTag propNode = (CompositeTag) prop.getBlockNode(); | |
| String className = propNode.getAttribute("class"); | |
| String idName = propNode.getAttribute("id"); | |
| String checkName = null; | |
| if (idName != null) | |
| checkName = idName; | |
| else if (className != null) | |
| checkName = className; | |
| if (checkName != null) { | |
| if (checkName.contains("comment") || checkName.contains("reply")) { | |
| commentIndex = blockPropertiesList.indexOf(prop); | |
| continue; | |
| } else if (containTrimClassID(propNode)) | |
| continue; | |
| } | |
| } | |
| catch (NullPointerException e) { | |
| e.printStackTrace(); | |
| } | |
| if (commentIndex != 0 && blockPropertiesList.indexOf(prop) > commentIndex) | |
| continue; | |
| if (count == 0) { | |
| // System.out.println( "\n\n--> Wegith= " + propMap.get( prop ) ); prop.print(); | |
| contentProp = prop; | |
| count++; | |
| } else | |
| break; | |
| } | |
| // 找到正文區塊還有其子區块,判斷是否包含連結區塊或是特定class, id名稱,然後過濾掉。 | |
| Node currentNode = contentProp.getBlockNode(); | |
| String contentHtml = | |
| org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
| // currentNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
| String contentText = WebCrawler.filterSpecialSymbol( | |
| org.apache.commons.lang3.StringUtils.replacePattern(currentNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
| // currentNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ); | |
| // System.out.println( contentText.length() ); | |
| String checkContentHtml = contentHtml; // check開頭的變數是給迴圈判斷正文區塊的子區塊用的 | |
| String checkContentText = contentText; // 因為在迴圈中contentHtml和contentText的字串會變動,所以無法拿來判斷子區塊 | |
| Map<String, Integer> trimTextMap = new TreeMap<String, Integer>(); | |
| for (int i = visualBlockNodeList.indexOf(contentProp.getBlockNode()) + 1; | |
| i < visualBlockNodeList.size(); i++ | |
| ) { | |
| CompositeTag nextNode = (CompositeTag) visualBlockNodeList.elementAt(i); | |
| String nextHtml = | |
| org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toHtml(), "\\s+", " ").toLowerCase().trim(); | |
| // nextNode.toHtml().replaceAll( "\\s+", " " ).toLowerCase().trim(); | |
| String nextText = WebCrawler.filterSpecialSymbol( | |
| org.apache.commons.lang3.StringUtils.replacePattern(nextNode.toPlainTextString(), "\\s+", " ").toLowerCase().trim()); | |
| // nextNode.toPlainTextString().replaceAll( "\\s+", " " ).trim() ).trim(); | |
| if (checkContentHtml.contains(nextHtml) && | |
| checkContentText.contains(nextText)) { | |
| // System.out.println( "*" + nextHtml ); | |
| // System.out.println( "\t" + nextText ); | |
| // System.out.println( nextText.length() ); | |
| if (containTrimClassID(nextNode)) { | |
| // 預防過濾掉整個正文字串 | |
| if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
| // System.out.printf("\t過濾掉:%s\n", nextText ); | |
| trimTextMap.put(nextHtml, nextText.length()); | |
| } | |
| } | |
| if (linkNodeList.contains(nextNode)) { | |
| int linkTextLen = getLinkTextLength(nextNode); | |
| double linkTextRatio = (double) linkTextLen / (double) nextText.length(); | |
| if (linkTextRatio >= 0.65) { | |
| if (nextText.length() < (double) contentText.length() * (2.0 / 3.0)) { | |
| trimTextMap.put(nextHtml, nextText.length()); | |
| // System.out.println( "\t連結區: " + linkTextRatio ); | |
| } | |
| } | |
| } | |
| } else | |
| break; | |
| } | |
| /** | |
| * 不在上面直接過濾而且是由字串長的過濾到短的是因為 | |
| * 如果要過濾的字串是很短的話,過濾掉後會影響到長字串的過濾 | |
| * ex: 「我今天去了淡水,還有去淡水老街。」 | |
| * 先過濾「淡水」 ==> 「我今天去了,還有去老街。」 | |
| * 再過濾「淡水老街」 ==> 無法過濾掉「淡水老街」,因為字串剩下「老街」 | |
| * 過濾不完全!! | |
| * 但如果先過濾掉「淡水老街」 ==> 「我今天去了淡水,還有去。」 | |
| * 在過濾掉「淡水」 ==> 「我今天去了,還有去。」 | |
| * 完全過濾!! | |
| */ | |
| trimTextMap = MapUtils.sortByValue(trimTextMap, true); | |
| for (String html : trimTextMap.keySet()) { | |
| // System.out.println( "Filter=" + html ); | |
| contentHtml = contentHtml.replace(html, ""); | |
| } | |
| contentProp.setBlockHtml( | |
| org.apache.commons.lang3.StringUtils.replacePattern( | |
| org.apache.commons.lang3.StringUtils.replacePattern(contentHtml, HtmlTag.BR_REPLACE_REGEX, " "), HtmlTag.STYLE_REPLACE_REGEX, "")); | |
| return contentProp; | |
| } |
Demo

Leave a comment