用户:Wikibot:修订间差异
无编辑摘要 |
无编辑摘要 |
||
第1,554行: | 第1,554行: | ||
while (true) { | while (true) { | ||
try { | try { | ||
Thread | Thread.sleep(1000); | ||
} catch (InterruptedException ex) { | } catch (InterruptedException ex) { | ||
ex.printStackTrace(); | ex.printStackTrace(); |
2011年5月19日 (四) 17:43的最新版本
机器人:自动将 http://help.ubuntu.com 和 http://wiki.ubuntu.com 的页面由 MoinMoin 格式转换为 MediaWiki 格式,并自动更新和发布的小程序,由 Java 写成。
<source lang="java">
/*
* Main.java * * Created on 2007年5月12日, 下午1:31 * * To change this template, choose Tools | Template Manager * and open the template in the editor. */
package wiki;
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.security.GeneralSecurityException; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.X509TrustManager;
/**
* * @author oneleaf */
public class Main {
// Page names collected by addDict(String); findDict(String) also searches this list.
// Presumably the queue of pages still to be fetched - confirm against the main loop.
List<String> addDict = new ArrayList<String>();
// Page names moved here from addDict by delDict(...); addDict(String) refuses names already listed here.
List<String> oldDict = new ArrayList<String>();
// Sent as the "Cookie" request header by getWikiHtml and putText. The initial value is a placeholder.
String cookie = "NEED INPUT";
// Tuned by getUrl(): +100 on every response code above 500, reset to 2500 once it exceeds 10000.
// Presumably a delay in milliseconds between page reads - TODO confirm where it is consumed.
int readOneDictPayTime = 2500;
/**
 * Reads the stream to its end and decodes the bytes as UTF-8.
 *
 * The stream is always closed, also when reading fails.
 *
 * @param in stream to drain; closed before this method returns or throws
 * @return the decoded text, empty if the stream had no bytes
 * @throws IOException if reading from or closing the stream fails
 */
public String getHtmlFromStream(InputStream in) throws IOException {
    try {
        // Collect raw bytes first so that multi-byte UTF-8 sequences are decoded in one go.
        java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = in.read(chunk)) > -1) {
            bytes.write(chunk, 0, read);
        }
        return new String(bytes.toByteArray(), "utf-8");
    } finally {
        in.close();
    }
}
private void addDict(String dict) { String str = dict.trim();
if (dict.startsWith("/")) { str = dict.substring(1); } if (dict.indexOf("#") > 0) { str = dict.substring(0, dict.indexOf("#")); } if (dict.indexOf("?") > 0) { str = dict.substring(0, dict.indexOf("?")); } if (dict.startsWith("./") || dict.startsWith("//")) { str = dict.substring(2); } if (dict.startsWith("../")) { str = dict.substring(3); } if (dict.endsWith(".") || dict.endsWith(")") || dict.endsWith("]") || dict.endsWith("}") || dict.endsWith(";") || dict.endsWith("+") || dict.endsWith("'") || dict.endsWith("\"") || dict.endsWith("/") || dict.endsWith(":") || dict.endsWith(",") || dict.endsWith(">")) { str = dict.substring(0, str.length() - 1); }
// if (str.toLowerCase().indexOf("team")>0) return;
if (str.trim().length() == 0) { return; } if (str.trim().length() >= 256) { return; //../CommandLine //Community => community } if ("Community".equals(str)) { return; } if (oldDict.contains(str)) { return; } if (addDict.contains(str)) { return; } addDict.add(str); }
/**
 * Moves the entry at the given position from {@code addDict} to {@code oldDict}.
 *
 * @param dictindex index into {@code addDict}
 */
private void delDict(int dictindex) {
    // List.remove(int) hands back the removed element, which is what gets archived.
    oldDict.add(addDict.remove(dictindex));
}
/**
 * Removes the first occurrence of the name from {@code addDict} (if any) and
 * records the name in {@code oldDict}.
 *
 * @param dict page name to retire
 */
private void delDict(String dict) {
    addDict.remove(dict);
    oldDict.add(dict);
}
/** Empties both {@code oldDict} and {@code addDict}. */
private void clearDict() {
    oldDict.clear();
    addDict.clear();
}
/**
 * Tells whether a page name is present in {@code addDict} or {@code oldDict}.
 *
 * Uses {@link List#contains(Object)}, the same lookup {@code addDict(String)} uses.
 *
 * @param dict page name to look up
 * @return {@code true} if either list contains an equal name
 */
private boolean findDict(String dict) {
    return addDict.contains(dict) || oldDict.contains(dict);
}
/** Matches {@code [UbuntuHelp:target]}; group 1 is everything up to the first {@code ]}. */
private static final Pattern UBUNTU_HELP_LINK = Pattern.compile("\\[UbuntuHelp:(.*?)\\]");

/** Matches {@code [UbuntuWiki:target]}; group 1 is everything up to the first {@code ]}. */
private static final Pattern UBUNTU_WIKI_LINK = Pattern.compile("\\[UbuntuWiki:(.*?)\\]");

/**
 * Collects the targets of all {@code [UbuntuHelp:...]} links, then of all
 * {@code [UbuntuWiki:...]} links, and passes each one to {@code addDict(String)}.
 *
 * @param html converted page text to scan
 */
private void getDicts(String html) {
    collectLinkTargets(UBUNTU_HELP_LINK, html);
    collectLinkTargets(UBUNTU_WIKI_LINK, html);
}

/** Feeds every link target matched by {@code link} to addDict, dropping a "|label" suffix. */
private void collectLinkTargets(Pattern link, String html) {
    Matcher matcher = link.matcher(html);
    while (matcher.find()) {
        String target = matcher.group(1);
        int pipe = target.indexOf("|");
        // A pipe at index 0 leaves the target untouched, as before.
        addDict(pipe > 0 ? target.substring(0, pipe) : target);
    }
}
/** Creates a new instance of Main */ public Main() { SSLContext sslContext = null; try { sslContext = SSLContext.getInstance("TLS"); X509TrustManager[] xtmArray = new X509TrustManager[]{xtm}; sslContext.init(null, xtmArray, new java.security.SecureRandom()); } catch (GeneralSecurityException gse) { } if (sslContext != null) { HttpsURLConnection.setDefaultSSLSocketFactory(sslContext.getSocketFactory()); } HttpsURLConnection.setDefaultHostnameVerifier(hnv); } private X509TrustManager xtm = new X509TrustManager() {
@Override public void checkClientTrusted(X509Certificate[] chain, String authType) { }
@Override public void checkServerTrusted(X509Certificate[] chain, String authType) { }
@Override public X509Certificate[] getAcceptedIssuers() { return null; } }; private HostnameVerifier hnv = new HostnameVerifier() {
@Override public boolean verify(String hostname, SSLSession session) { return true; } };
/**
 * Downloads a page and converts it with {@code moin2wm}.
 *
 * Response codes above 500 are retried without limit: each retry bumps
 * {@code readOneDictPayTime} by 100 (reset to 2500 once it exceeds 10000) and sleeps one
 * minute longer than the previous one. Codes 400-500 give up and return an empty string.
 * Code 400 itself used to fall through to {@code getInputStream()}, which throws for
 * error responses.
 *
 * @param urladdress address of the raw page
 * @param dict page name, passed through to {@code moin2wm}
 * @return the converted text, or "" when the server answered with a code from 400 to 500
 * @throws IOException if the connection or the download fails
 * @throws InterruptedException if the thread is interrupted while waiting to retry
 */
public String getUrl(String urladdress, String dict) throws IOException, InterruptedException {
    URL url = new URL(urladdress);
    HttpURLConnection httpConn;
    int sleepMinutes = 1;
    while (true) {
        httpConn = (HttpURLConnection) url.openConnection();
        httpConn.setReadTimeout(60000);
        httpConn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
        httpConn.setRequestProperty("Content-Language", "UTF-8");
        httpConn.setRequestProperty("Connection", "Keep-Alive");

        int status = httpConn.getResponseCode();
        if (status > 500) {
            System.out.println("read " + status + " from " + urladdress + " sleep " + sleepMinutes + " minute.");
            readOneDictPayTime = readOneDictPayTime + 100;
            System.out.println("adjust readOneDictPayTime to " + readOneDictPayTime);
            if (readOneDictPayTime > 10000) {
                readOneDictPayTime = 2500;
            }
            Thread.sleep(60000L * sleepMinutes);
            sleepMinutes = sleepMinutes + 1;
        } else if (status >= 400) {
            // Client errors and a plain 500 are treated as "nothing to convert".
            return "";
        } else {
            break;
        }
    }

    InputStream in = httpConn.getInputStream();
    try {
        String html = getHtmlFromStream(in);
        return moin2wm(html, urladdress, dict);
    } finally {
        in.close();
    }
}
/**
 * Converts page text from MoinMoin markup to MediaWiki markup through a long series of
 * regular-expression replacements, then wraps it in a generated header and footer.
 *
 * Visible behaviour:
 * - returns "" when the text contains "This page does not exist yet." at an index above 0;
 * - uses the namespace prefix "UbuntuHelp", or "UbuntuWiki" when url starts with "https://wiki";
 * - swaps {{{ ... }}} sections for @@codeN@@ placeholders before converting and restores them afterwards;
 * - returns "" for a #REDIRECT whose target differs from dict only in letter case, and when the
 *   converted text is shorter than 10 characters after trimming.
 *
 * NOTE(review): the header is built with url.substring(0, url.indexOf("?")), so url presumably
 * has to contain a "?" - confirm against the callers.
 * NOTE(review): several string literals further down look as if markup was stripped when this
 * page was rendered; check them against the original source before relying on them.
 *
 * @param html raw MoinMoin page text
 * @param url address the text was fetched from
 * @param dict page name used when building links and the header
 * @return the converted text, or "" in the cases listed above
 * @throws UnsupportedEncodingException declared by attachmentUrl, which this method calls
 */
public String moin2wm(String html, String url, String dict) throws UnsupportedEncodingException { String text = html; if (text.indexOf("This page does not exist yet.") > 0) { return ""; } String ex = "UbuntuHelp"; String turl = "https://help.ubuntu.com/community/"; if (url.startsWith("https://wiki")) { ex = "UbuntuWiki"; turl = "https://wiki.ubuntu.com/"; }
String head = "
文章出处: |
{{#if: | [" + url.substring(0, url.indexOf("?")) + " {{{2}}}] | " + url.substring(0, url.indexOf("?")) + " }} |
\r\n
点击翻译: |
English {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/af | • {{#if: " + ex + ":" + dict + "|Afrikaans| Afrikaans}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ar | • {{#if: " + ex + ":" + dict + "|العربية| العربية}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/az | • {{#if: " + ex + ":" + dict + "|azərbaycanca| azərbaycanca}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/bcc | • {{#if: " + ex + ":" + dict + "|جهلسری بلوچی| جهلسری بلوچی}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/bg | • {{#if: " + ex + ":" + dict + "|български| български}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/br | • {{#if: " + ex + ":" + dict + "|brezhoneg| brezhoneg}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ca | • {{#if: " + ex + ":" + dict + "|català| català}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/cs | • {{#if: " + ex + ":" + dict + "|čeština| čeština}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/de | • {{#if: " + ex + ":" + dict + "|Deutsch| Deutsch}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/el | • {{#if: " + ex + ":" + dict + "|Ελληνικά| Ελληνικά}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/es | • {{#if: " + ex + ":" + dict + "|español| español}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/fa | • {{#if: " + ex + ":" + dict + "|فارسی| فارسی}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + 
" | {{#if: 用户 | 用户:}}Wikibot}}/fi | • {{#if: " + ex + ":" + dict + "|suomi| suomi}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/fr | • {{#if: " + ex + ":" + dict + "|français| français}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/gu | • {{#if: " + ex + ":" + dict + "|ગુજરાતી| ગુજરાતી}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/he | • {{#if: " + ex + ":" + dict + "|עברית| עברית}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/hu | • {{#if: " + ex + ":" + dict + "|magyar| magyar}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/id | • {{#if: " + ex + ":" + dict + "|Bahasa Indonesia| Bahasa Indonesia}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/it | • {{#if: " + ex + ":" + dict + "|italiano| italiano}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ja | • {{#if: " + ex + ":" + dict + "|日本語| 日本語}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ko | • {{#if: " + ex + ":" + dict + "|한국어| 한국어}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ksh | • {{#if: " + ex + ":" + dict + "|Ripoarisch| Ripoarisch}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/mr | • {{#if: " + ex + ":" + dict + "|मराठी| मराठी}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ms | • {{#if: " + ex + ":" + dict + "|Bahasa Melayu| Bahasa Melayu}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/nl | • {{#if: " + ex + ":" + dict + "|Nederlands| 
Nederlands}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/no | • {{#if: " + ex + ":" + dict + "|norsk| norsk}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/oc | • {{#if: " + ex + ":" + dict + "|occitan| occitan}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/pl | • {{#if: " + ex + ":" + dict + "|polski| polski}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/pt | • {{#if: " + ex + ":" + dict + "|português| português}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ro | • {{#if: " + ex + ":" + dict + "|română| română}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/ru | • {{#if: " + ex + ":" + dict + "|русский| русский}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/si | • {{#if: " + ex + ":" + dict + "|සිංහල| සිංහල}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/sq | • {{#if: " + ex + ":" + dict + "|shqip| shqip}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/sr | • {{#if: " + ex + ":" + dict + "|српски / srpski| српски / srpski}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/sv | • {{#if: " + ex + ":" + dict + "|svenska| svenska}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/th | • {{#if: " + ex + ":" + dict + "|ไทย| ไทย}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/tr | • {{#if: " + ex + ":" + dict + "|Türkçe| Türkçe}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 
用户:}}Wikibot}}/vi | • {{#if: " + ex + ":" + dict + "|Tiếng Việt| Tiếng Việt}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/yue | • {{#if: " + ex + ":" + dict + "|粵語| 粵語}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/zh | • {{#if: " + ex + ":" + dict + "|中文| 中文}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/zh-hans | • {{#if: " + ex + ":" + dict + "|中文(简体)| 中文(简体)}}|}} {{#ifexist: {{#if: " + ex + ":" + dict + " | " + ex + ":" + dict + " | {{#if: 用户 | 用户:}}Wikibot}}/zh-hant | • {{#if: " + ex + ":" + dict + "|中文(繁體)| 中文(繁體)}}|}} |
{{#ifeq:" + ex + ":" + dict + "|用户:Wikibot|请不要直接编辑翻译本页,本页将定期与来源同步。}} |
{{#ifexist: 用户:Wikibot/zh | | {{#ifexist: Wikibot/zh | | {{#ifeq: {{#titleparts:Wikibot|1|-1|}} | zh | | }} }} }} {{#ifeq: {{#titleparts:Wikibot|1|-1|}} | zh | | }} \r\n";
//删除注释 text = text.replaceAll("\r\n##(.*)", ""); text = text.replaceAll("\r\n#format(.*)", ""); text = text.replaceAll("\r\n#language(.*)", ""); text = text.replaceAll("\r\n#pragma(.*)", ""); text = text.replaceAll("\r\n#acl(.*)", ""); text = text.replaceAll("^##(.*)\r\n", ""); text = text.replaceAll("^#format(.*)\r\n", ""); text = text.replaceAll("^#language(.*)\r\n", ""); text = text.replaceAll("^#pragma(.*)\r\n", ""); text = text.replaceAll("^#acl(.*)\r\n", ""); //替换#REDIRECT PDFPrinting => #REDIRECT PDFPrinting text = text.replaceAll("#REDIRECT (\\S*)", "#REDIRECT " + "" + ex + ":$1"); text = text.replaceAll("#redirect (\\S*)", "#REDIRECT " + "" + ex + ":$1"); //#refresh 0 https://wiki.ubuntu.com/ASUS_A3H_5010_Laptop_with_Ubuntu text = text.replaceAll("#REFRESH (.*?) (\\S*)", "#REDIRECT " + "" + ex + ":$2"); text = text.replaceAll("#refresh (.*?) (\\S*)", "#REDIRECT " + "" + ex + ":$2");
//删除不要的下拉列表 text = text.replaceAll("\\[\\[Navigation\\(.*?\\)\\]\\]", ""); text = text.replaceAll("Navigation\\(.*?\\)", ""); //删除主题 text = text.replaceAll(".*TableOfContents.*", "");
//分离{{{ }}}
Pattern pattern = Pattern.compile("\\{\\{\\{(.*?)\\}\\}\\}");
Matcher matcher = pattern.matcher(text);
List<String> codes = new ArrayList();
while (matcher.find()) {
String code = matcher.group(1);
codes.add("" + code + "
");
int index = text.indexOf(matcher.group(0));
int len = matcher.group(0).length();
text = text.substring(0, index) + "@@code" + String.valueOf(codes.size()) + "@@" + text.substring(index + len);
}
pattern = Pattern.compile("\\{\\{\\{(.*?)\\}\\}\\}", Pattern.DOTALL); String html2 = text; matcher = pattern.matcher(html2); while (matcher.find()) { String code = matcher.group(1);
codes.add("
" + code + "
");
int index = text.indexOf("{{{" + matcher.group(1) + "}}}"); int len = matcher.group(1).length() + 6; text = text.substring(0, index) + "@@code" + String.valueOf(codes.size()) + "@@" + text.substring(index + len); }
//转化表格 text = tableConv(text);
//标题从二开始 text = text.replaceAll("= (.*?) =", "== $1 =="); //转化List text = replaceList(text); //BR ->
text = text.replaceAll("\\[\\[BR\\]\\]", "
"); //link convert superscripted - ^ * ^ -> * text = text.replaceAll("\\^(.*)\\^", "$1"); //link convert subscripted - ,, * ,, -> * text = text.replaceAll(",,(.*?),,", "$1");
//link convert - ["/*"] -> [dict/*] text = text.replaceAll("\\[\\[\"//(.*?)\\|(.*?)\"\\]\\]", "$2"); text = text.replaceAll("\\[\\[\"/(.*?)\\|(.*?)\"\\]\\]", "$2"); text = text.replaceAll("\\[\\[\"\\./(.*?)\\|(.*?)\"\\]\\]", "$2"); text = text.replaceAll("\\[\\[\"\\.\\./(.*?)\\|(.*?)\"\\]\\]", "[[" + ex + ":" + dict + "/../$1|$2]]");
text = text.replaceAll("\\[\\[\"//(.*?)\"\\]\\]", "$1"); text = text.replaceAll("\\[\\[\"/(.*?)\"\\]\\]", "$1"); text = text.replaceAll("\\[\\[\"\\./(.*?)\"\\]\\]", "$1"); text = text.replaceAll("\\[\\[\"\\.\\./(.*?)\"\\]\\]", "[[" + ex + ":" + dict + "/../$1|$1]]"); //link convert - /* -> dict/* //Introduction text = text.replaceAll("\\[\\[//(.*?)\\|(.*?)\\]\\]", "$2"); text = text.replaceAll("\\[\\[/(.*?)\\|(.*?)\\]\\]", "$2"); text = text.replaceAll("\\[\\[\\./(.*?)\\|(.*?)\\]\\]", "$2"); text = text.replaceAll("\\[\\[\\.\\./(.*?)\\|(.*?)\\]\\]", "[[" + ex + ":" + dict + "/../$1|$2]]");
text = text.replaceAll("\\[\\[//(.*?)\\]\\]", "$1"); text = text.replaceAll("\\[\\[/(.*?)\\]\\]", "$1"); text = text.replaceAll("\\[\\[\\./(.*?)\\]\\]", "$1"); text = text.replaceAll("\\[\\[\\.\\./(.*?)\\]\\]", "[[" + ex + ":" + dict + "/../$1|$1]]");
//link convert - [" * "] -> UbuntuHelp: * text = text.replaceAll("\\[\"(.*?)\"\\]", "$1");
//link convert - [# * ] -> * text = text.replaceAll("\\[#(.*?)\\]", "$1"); //link convert - [: / * : * ] -> * text = text.replaceAll("\\[:/(.*?):(.*?)\\]", "$2"); //link convert - [: * : * ] -> * text = text.replaceAll("\\[:(.*?):(.*?)\\]", "$2"); //link convert - [: / * ] -> UbuntuHelp: dict * text = text.replaceAll("\\[:/(.*?)\\]", "" + ex + ":" + dict + "/$1"); //link convert - [: * ] -> UbuntuHelp: * text = text.replaceAll("\\[:(.*?)\\]", "" + ex + ":$1"); //link convert - wiki:cat -> UbuntuWiki:cat //link convert - wiki:Ubuntu/cat -> UbuntuWiki:cat text = text.replaceAll(" wiki:Ubuntu/(\\S*)", " UbuntuWiki:$1"); text = text.replaceAll("\r\nwiki:Ubuntu/(\\S*)", "\r\nUbuntuWiki:$1"); text = text.replaceAll(" wiki:(\\S*)", " UbuntuWiki:$1"); text = text.replaceAll("\r\nwiki:(\\S*)", "\r\nUbuntuWiki:$1"); //link convert - [wiki:cat * ] -> * text = text.replaceAll("\\[wiki:Ubuntu/(.*?)\\ (.*?)\\]", "$2"); text = text.replaceAll("\\[wiki:(.*?)\\ (.*?)\\]", "$2"); //link convert - [wiki:cat * ] -> * text = text.replaceAll("\\[wiki:Ubuntu/(.*?)\\]", "UbuntuWiki:$1"); text = text.replaceAll("\\[wiki:(.*?)\\]", "UbuntuWiki:$1"); //link convert - [UbuntuWiki:\*] -> [UbuntuWiki:dict\*] text = text.replaceAll("\\[UbuntuWiki:\\\\(.*?)\\]", "[UbuntuWiki:" + dict + "\\$1]"); //link convert - [UbuntuHelp:\*] -> [UbuntuHelp:dict\*] text = text.replaceAll("\\[UbuntuHelp:\\\\(.*?)\\]", "[UbuntuHelp:" + dict + "\\$1]");
//Self:/ddd=dict/ Self:ddd=dict text = text.replaceAll(":Self:(.*?)//", ":" + dict + "/$1"); text = text.replaceAll(":Self:(.*?)/", ":" + dict + "/$1"); text = text.replaceAll(":Self:(.*?)", ":$1"); text = text.replaceAll(":self:(.*?)//", ":" + dict + "/$1"); text = text.replaceAll(":self:(.*?)/", ":" + dict + "/$1"); text = text.replaceAll(":self:(.*?)", ":$1");
//link convert - __ * __ -> * text = text.replaceAll("__(.*?)__", "$1");
//CategoryHomepage =>; text = text.replaceAll("Category(\\S*)", "");
text = text.replaceAll("\r\n( *)", "\r\n"); text = attachmentUrl(text, turl, dict); //xxx:http => http: text = text.replaceAll("\\[\\[(.*?):http(.*?)\\]\\]", "http$2"); //xxx:ftp => ftp: text = text.replaceAll("\\[\\[(.*?):ftp(.*?)\\]\\]", "ftp$2");
//Ubuntu:HardwareSupport =>UbuntuWiki:HardwareSupport text = text.replaceAll("\\[\\[Ubuntu:(.*?)\\]\\]", "UbuntuWiki:$1"); //[[1]] => UbuntuWiki: text = text.replaceAll("\\[\\(.*?)\\\\]", "$2"); text = text.replaceAll("\\[\\[2]\\]", "UbuntuWiki:$1"); //[[3]] => UbuntuHelp: text = text.replaceAll("\\[\\(.*?)\\\\]", "$2"); text = text.replaceAll("\\[\\[4]\\]", "UbuntuHelp:$1"); //[[5]] => UbuntuWiki: text = text.replaceAll("\\[\\(.*?)\\\\]", "$2"); text = text.replaceAll("\\[\\[6]\\]", "UbuntuWiki:$1"); //[[7]] => UbuntuHelp: text = text.replaceAll("\\[\\(.*?)\\\\]", "$2"); text = text.replaceAll("\\[\\[8]\\]", "UbuntuHelp:$1");
text = text.replaceAll("\\(.*?)\\", "$2"); text = text.replaceAll("\\(.*?)\\", "$2"); text = text.replaceAll("\\(.*?)\\", "$2"); text = text.replaceAll("\\(.*?)\\", "$2"); text = text.replaceAll("\\[9]", "UbuntuWiki:$1"); text = text.replaceAll("\\[10]", "UbuntuWiki:$1"); text = text.replaceAll("\\[11]", "UbuntuHelp:$1"); text = text.replaceAll("\\[12]", "UbuntuHelp:$1");
text = text.replaceAll("\\[\\[(.*?)/(.*?)/\\.\\./(.*?)\\]\\]", "$1/$3"); text = text.replaceAll("\\[\\[(.*?)/(.*?)/\\.\\./(.*?)\\]\\]", "$1/$3"); text = text.replaceAll("\\[\\[(.*?)/\\./(.*?)\\]\\]", "$1/$2");
//[[13]] => RedHat text = text.replaceAll("\\[\\[(http://.*?)\\%7C(.*?)\\]\\]", "[$1 $2]"); //[[14]] => RedHat text = text.replaceAll("\\[\\[(https://.*?)\\%7C(.*?)\\]\\]", "[$1 $2]"); //[[15]] => RedHat text = text.replaceAll("\\[\\[(ftp://.*?)\\%7C(.*?)\\]\\]", "[$1 $2]"); //[[16]] => [17] text = text.replaceAll("\\[\\[(http://.*?)\\]\\]", "[$1]"); //[[18]] => [19] text = text.replaceAll("\\[\\[(https://.*?)\\]\\]", "[$1]"); //[[20]] => [21] text = text.replaceAll("\\[\\[(ftp://.*?)\\]\\]", "[$1]");
//redhat => readhat text = text.replaceAll("\\[\\[(?!UbuntuWiki:|UbuntuHelp:|https|http|ftp|category)(.*?)\\|(.*?)\\]\\]", "$2");
//RedHat => ReadHat text = text.replaceAll("\\[\\[(?!UbuntuWiki:|UbuntuHelp:|https|http|ftp|category)(.*?)\\]\\]", "$1");
//转化% text = formatdict(text);
//转化单词 text = format2(text, ex, dict);
//需要还原$code$ for (int i = 0; i < codes.size(); i++) { String str = codes.get(i); String s = "@@code" + String.valueOf(i + 1) + "@@"; int m = text.indexOf(s); int n = s.length(); //text = text.substring(0, m) + str + text.substring(m + n); text = text.replace(s, str); }
String foot = "\r\n";
if (text.trim().startsWith("#REDIRECT")) {
// System.out.print(" "+text.trim());
pattern = Pattern.compile("\\[\\[" + ex + ":(.*?)\\]\\]"); matcher = pattern.matcher(text); if (matcher.find()) { //如果仅仅是大小写的重定向,就不用考虑直接忽略。 if (matcher.group(1).toLowerCase().equals(dict.toLowerCase())) { System.out.println(dict + " redirect to " + matcher.group(1) + " , ignore. "); return ""; } } return text + head + foot; } if (text.trim().length() < 10) { return ""; } return head + text + foot; }
/**
 * Rewrites every {@code [[target]]} link whose target contains a percent sign (not at
 * index 0) so that the target is passed through {@code getDict}.
 *
 * The brackets are part of both the searched and the replacement text. The surrounding
 * literals had been reduced to empty strings while the length arithmetic still counted
 * four bracket characters, so the wrong span was replaced.
 *
 * @param text page text to scan
 * @return the text with percent-encoded link targets normalised
 */
public String formatdict(String text) {
    Pattern pattern = Pattern.compile("\\[\\[(.*?)\\]\\]");
    Matcher matcher = pattern.matcher(text);
    StringBuffer html = new StringBuffer();
    while (matcher.find()) {
        String dict = matcher.group(1);
        if (dict.indexOf("%") > 0) {
            // Replace by match position, so repeated identical links are each handled once.
            matcher.appendReplacement(html, Matcher.quoteReplacement("[[" + getDict(dict) + "]]"));
        }
    }
    matcher.appendTail(html);
    return html.toString();
}
/**
 * Strips percent escapes from a page name: each "%" followed by two characters becomes
 * "_", and any remaining "%" followed by one character is removed.
 *
 * @param dict page name, possibly percent-encoded
 * @return the name without percent escapes
 */
public String getDict(String dict) {
    String withoutFullEscapes = dict.replaceAll("%..", "_");
    return withoutFullEscapes.replaceAll("%.", "");
}
//检查有没有至少两个大写单词存在,如果存在检查是不是词汇,如果是,转为链接。 public boolean arrayhas(char[] c, char b) { for (int i = 0; i < c.length; i++) { if (c[i] == b) { return true; } } return false; }
/**
 * Overwrites every match of a regular expression with spaces, keeping the text length and
 * therefore all character offsets unchanged.
 *
 * Matches are blanked at the position where the matcher found them. Looking the matched
 * string up again with indexOf always hit its first occurrence, so repeated identical
 * matches were left in place. Empty matches are ignored instead of failing.
 *
 * @param text text to mask
 * @param match regular expression selecting the spans to blank out
 * @return a copy of {@code text} of equal length with all matches replaced by spaces
 */
public String replaceBySpace(String text, String match) {
    Matcher matcher = Pattern.compile(match).matcher(text);
    StringBuilder masked = new StringBuilder(text);
    while (matcher.find()) {
        for (int i = matcher.start(); i < matcher.end(); i++) {
            masked.setCharAt(i, ' ');
        }
    }
    return masked.toString();
}
/**
 * Looks for words in the text that are known page names and replaces them in the result.
 *
 * A working copy of the text first has templates, headings, bracketed links and http/https
 * URLs blanked out with replaceBySpace. The copy is then scanned character by character;
 * a word is a run of ASCII letters, digits and the characters : % ? & = \ / - _.
 * A word is replaced when it has more than one upper-case and at least one lower-case letter,
 * is not "IconsPage" or "AttachFile", contains no slash or backslash, is known to findDict
 * and differs from dict.
 *
 * NOTE(review): the replacement literal built further down appears to have lost its link
 * markup when this page was rendered, and ex is not referenced in the visible body -
 * check against the original source.
 *
 * @param html converted page text
 * @param ex namespace prefix supplied by the caller
 * @param dict name of the page being converted; never replaced
 * @return the text with known page names replaced
 */
public String format2(String html, String ex, String dict) { String result = html; int n_count = 0; int m_count = 0; char[] o = {':', '%', '?', '&', '=', '\\', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'}; char[] m = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; char[] n = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}; String temp = html; //删除现有的所有的链接 temp = replaceBySpace(temp, "\\{\\{.*?\\}\\}"); temp = replaceBySpace(temp, "=====.*?====="); temp = replaceBySpace(temp, "====.*?===="); temp = replaceBySpace(temp, "===.*?==="); temp = replaceBySpace(temp, "==.*?=="); temp = replaceBySpace(temp, "=.*?="); temp = replaceBySpace(temp, "\\[\\[.*?\\]\\]"); temp = replaceBySpace(temp, "\\[.*?\\]"); temp = replaceBySpace(temp, "http:.*? "); temp = replaceBySpace(temp, "https:.*? ");
temp = replaceBySpace(temp, "
.*?
");
temp = replaceBySpace(temp, ".*?
");
String word = "";
for (int i = 0; i < temp.length(); i++) {
char c = temp.charAt(i);
if (arrayhas(m, c)) {
word = word + String.valueOf(c);
m_count++;
} else if (arrayhas(n, c)) {
word = word + String.valueOf(c);
n_count++;
} else if (arrayhas(o, c)) {
word = word + String.valueOf(c);
} else {
//单词截取完成
if (n_count > 1 && m_count > 0) {
if (!word.equals("IconsPage") && !word.equals("AttachFile") &&
word.indexOf("\\") < 0 && word.indexOf("/") < 0 && findDict(word) && !word.equals(dict)) {
try {
int index = temp.indexOf(word);
int len = word.length();
String replace = "" + word + "";
result = result.substring(0, index) + replace + result.substring(index + len);
temp = temp.substring(0, index) + String.format("%-" + replace.length() + "s", "") + temp.substring(index + len);
} catch (Exception e) {
e.printStackTrace();
}
}
}
n_count = 0;
m_count = 0;
word = "";
}
}
return result;
}
/**
 * Prepares a page name for use in a URL: spaces become "%20", then everything from the
 * first "#" and afterwards from the first "?" is cut off. A marker at index 0 is kept.
 *
 * @param dict page name
 * @return the encoded name without anchor and query parts
 */
public String urlEncode(String dict) {
    String encoded = dict.replaceAll(" ", "%20");
    for (String marker : new String[]{"#", "?"}) {
        int cut = encoded.indexOf(marker);
        if (cut > 0) {
            encoded = encoded.substring(0, cut);
        }
    }
    return encoded;
}
/**
 * Tells whether a trimmed line opens a list item.
 *
 * Recognised forms, for lines of at least four characters:
 * ". text" and "* text" (marker followed by a space), and "X. text" where X is one ASCII
 * letter or digit. The marker checks used to compare the first character with both the
 * marker and a space, which can never hold, so bullet and dot items were never detected.
 *
 * @param line line with leading and trailing whitespace already removed
 * @return {@code true} if the line starts a list item
 */
public boolean checkListStart(String line) {
    if (line.length() < 4) {
        return false;
    }
    char first = line.charAt(0);
    char second = line.charAt(1);
    if ((first == '.' || first == '*') && second == ' ') {
        return true;
    }
    boolean asciiLetterOrDigit = (first >= '0' && first <= '9')
            || (first >= 'a' && first <= 'z')
            || (first >= 'A' && first <= 'Z');
    return asciiLetterOrDigit && second == '.' && line.charAt(2) == ' ';
}
/**
 * Runs every consecutive group of list lines through {@code formatList2} and copies all
 * other lines unchanged. Lines that are blank after trimming are dropped; every emitted
 * line or block is followed by "\r\n".
 *
 * @param text page text with "\r\n" line separators
 * @return the text with list blocks converted
 */
public String replaceList(String text) {
    StringBuilder out = new StringBuilder();
    // Non-null while list lines are being collected.
    StringBuilder listBlock = null;
    for (String line : text.split("\r\n")) {
        String trimmed = line.trim();
        if (trimmed.equals("")) {
            continue;
        }
        if (checkListStart(trimmed)) {
            if (listBlock == null) {
                listBlock = new StringBuilder(line);
            } else {
                listBlock.append("\r\n").append(line);
            }
            continue;
        }
        if (listBlock != null) {
            out.append(formatList2(listBlock.toString())).append("\r\n");
            listBlock = null;
        }
        out.append(line).append("\r\n");
    }
    if (listBlock != null) {
        out.append(formatList2(listBlock.toString())).append("\r\n");
    }
    return out.toString();
}
/**
 * Counts the space characters at the start of a string.
 *
 * @param s line to inspect
 * @return number of leading ' ' characters; other whitespace ends the count
 */
public static int formatList_getSpaceCount(String s) {
    int count = 0;
    while (count < s.length() && s.charAt(count) == ' ') {
        count++;
    }
    return count;
}
/**
 * Removes the list marker from a list line and trims the rest.
 *
 * A line starting (after trimming) with "." or "*" loses one character; any other line
 * loses two, which covers markers such as "1.".
 *
 * @param mstr list line including its marker
 * @return the item text without marker and surrounding whitespace
 */
public static String formatList_clear(String mstr) {
    String item = mstr.trim();
    boolean singleCharMarker = item.startsWith(".") || item.startsWith("*");
    int markerLength = singleCharMarker ? 1 : 2;
    return item.substring(markerLength).trim();
}
/**
 * Converts one block of list lines (as collected by replaceList) into list markup.
 *
 * The block is split on "\r\n" and processed repeatedly until no line starts with a space
 * (see formatList_getSpaceCount); the list type is "ol" unless a trimmed line starts with
 * "*", in which case it is "ul". The lines are finally joined again with "\r\n".
 *
 * NOTE(review): the tag literals in the loop below appear to have been damaged when this
 * page was rendered; check them against the original source.
 *
 * @param text list lines separated by "\r\n"
 * @return the converted block, each line terminated by "\r\n"
 */
public String formatList2(String text) { String[] mlist = text.split("\r\n");
// if (mlist.length<2) return text;
while (true) { boolean done = true; for (String line : mlist) { if (formatList_getSpaceCount(line) > 0) { done = false; break; } } if (done) { break; } int lay = 0; String lType = "ol"; for (int i = 0; i < mlist.length; i++) { int space = formatList_getSpaceCount(mlist[i]); if (space > 0 && lay == 0) { String str = mlist[i].trim(); if (str.startsWith("*")) { lType = "ul"; }
mlist[i] = "<" + lType + ">
</" + lType + ">"; } } else if (space > 0 && lay == space) { mlist[i] = "
</" + lType + ">"; } } else if (lay > 0 && space == 0) { mlist[i] = "</" + lType + ">" + mlist[i]; break; } else { continue; } } } StringBuffer sb = new StringBuffer(); for (String line : mlist) { sb.append(line + "\r\n"); } return sb.toString(); } public String formatList(String text) { /* 1. one 1. two 1. one * bullet 1 * bullet 2 1. two 1. three * bullet 1. one *最好是转为*=>
#=>
*/ //获得步进长度 text = text.replaceAll(" 1\\. ", " # "); text = text.replaceAll(" a\\. ", " # "); text = text.replaceAll(" A\\. ", " # "); text = text.replaceAll(" i\\. ", " # "); text = text.replaceAll(" I\\. ", " # "); text = text.replaceAll(" \\. ", " # "); String step = ""; for (int i = 0; i < text.length(); i++) { if (text.charAt(i) == ' ') { step = step + " "; } else { break; } }
//修改步长为" " if (step.length() > 0) { text = text.replaceAll(step, " "); } String[] lines = text.split("\r\n"); for (int i = 0; i < lines.length; i++) { if (lines[i].charAt(0) != ' ') { break; } lines[i] = lines[i].substring(1); }
StringBuffer bf = new StringBuffer(); bf.append(lines[0] + "\r\n"); for (int i = 1; i < lines.length; i++) { if (lines[i].startsWith(" ")) { char[] s = lines[i].toCharArray(); for (int j = 0; j < s.length; j++) { if (s[j] == ' ') { if (j >= lines[i - 1].length()) { s[j] = 0; } if (lines[i - 1].charAt(j) != '*' && lines[i - 1].charAt(j) != '#') { s[j] = 0; } else { s[j] = lines[i - 1].charAt(j); } } else { break; } } String l = ""; for (int j = 0; j < s.length; j++) { if (s[j] == 0) { continue; } l = l + s[j]; } lines[i] = l; } bf.append(lines[i] + "\r\n"); } return bf.toString(); }
public String attachmentUrl(String text, String baseurl, String dict) throws UnsupportedEncodingException { //attachment:IconsPage/info.png -> while (true) { Pattern pattern = Pattern.compile("\\{\\{attachment:(.*?)/(.*?)\\}\\}"); Matcher matcher = pattern.matcher(text); String replace; if (matcher.find()) { replace = baseurl + matcher.group(1) + "?action=AttachFile&do=get&target=" + URLEncoder.encode(matcher.group(2), "UTF-8"); text = text.substring(0, matcher.start(0)) + replace + text.substring(matcher.end(0)); continue; }
pattern = Pattern.compile("\\{\\{attachment:(.*?)\\}\\}"); matcher = pattern.matcher(text); if (matcher.find()) { replace = baseurl + dict + "?action=AttachFile&do=get&target=" + URLEncoder.encode(matcher.group(1), "UTF-8"); text = text.substring(0, matcher.start(0)) + replace + text.substring(matcher.end(0)); continue; }
break; } return text; }
public String tableConv(String html) { //||a||b||c|| -> {| //||d||e||f|| |a||b||c // |- // |d||e||f // |}
// System.out.println(html);
String[] lines = html.split("\r\n"); String block = ""; StringBuffer bf = new StringBuffer(); boolean start = false; for (int i = 0; i < lines.length; i++) { String line = lines[i].trim(); if (line.startsWith("||")) { if (line.length() < 4) { continue; //line like ||(.*?)|| } String str = line.substring(1, line.length() - 2);
Pattern pattern = Pattern.compile("<(.*?)>"); Matcher matcher = pattern.matcher(line); while (matcher.find()) { String x = matcher.group(1).trim(); String replace = ""; if (x.startsWith("-")) { if (x.indexOf(" ") > 0) { replace = "colspan=" + x.substring(1, x.indexOf(" ")) + "|"; } else { replace = "colspan=" + x.substring(1) + "|"; } } else if (x.startsWith("|")) { if (x.indexOf(" ") > 0) { replace = "rowspan=" + x.substring(1, x.indexOf(" ")) + "|"; } else { replace = "rowspan=" + x.substring(1) + "|"; } }
// System.out.println("match:"+matcher.group(1)+" to:"+replace);
int m = str.indexOf("<" + matcher.group(1) + ">"); int n = ("<" + matcher.group(1) + ">").length(); if (m == -1 || (m + n) > str.length()) { continue; } str = str.substring(0, m) + replace + str.substring(m + n);
// str=str.replaceFirst("<"+matcher.group(1)+">",replace);
}
if (!start) { start = true; block = "{|border=\"1\" cellspacing=\"0\"\r\n" + str; } else { block = block + "\r\n|-\r\n" + str; } } else { if (start) { block = block + "\r\n|}\r\n"; bf.append(block); start = false; } bf.append(lines[i] + "\r\n"); } } if (start) { block = block + "\r\n|}\r\n"; bf.append(block); start = false; }
return bf.toString(); }
/**
 * Fetches a page with browser-like request headers and the configured {@code cookie},
 * and returns the response body decoded by {@code getHtmlFromStream}.
 *
 * A 60 second read timeout is set, as in getUrl and putText, so a stalled server cannot
 * block the caller forever. The response stream is closed even when reading fails.
 *
 * @param url address to fetch
 * @return the response body
 * @throws MalformedURLException if {@code url} is not a valid URL
 * @throws IOException if the request fails or times out
 */
public String getWikiHtml(String url) throws MalformedURLException, IOException {
    URL httpurl = new URL(url);
    HttpURLConnection httpConn = (HttpURLConnection) httpurl.openConnection();
    httpConn.setReadTimeout(60000);
    httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.10) Gecko/2009042523 Ubuntu/9.04 (jaunty) Firefox/3.0.10");
    httpConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    httpConn.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
    httpConn.setRequestProperty("Accept-Charset", "gb2312,utf-8;q=0.7,*;q=0.7");
    httpConn.setRequestProperty("Referer", "http://wiki.ubuntu.org.cn/");
    httpConn.setRequestProperty("Cookie", cookie);
    httpConn.setUseCaches(false);
    httpConn.setDoInput(true);
    InputStream in = httpConn.getInputStream();
    try {
        return getHtmlFromStream(in);
    } finally {
        // Closing twice is harmless if getHtmlFromStream already closed the stream.
        in.close();
    }
}
public void putText(String dict, String html, String surl) throws MalformedURLException, IOException { URL url; URLConnection conn; InputStream in; BufferedReader read; StringBuffer sb; if (html.length() < 5) { System.out.println(dict + " is short , ignore. " + html); return; }
// if (html.length()<300){ // if (html.toUpperCase().trim().indexOf("REFRESH")>0) { // System.out.println(dict+" is REFRESH."); // return; // } // if (html.toUpperCase().trim().indexOf("REDIRECT")>0) { // System.out.println(dict+" is REDIRECT."); // return; // } // }
String ex = "UbuntuHelp"; if (surl.startsWith("https://wiki")) { ex = "UbuntuWiki"; }
String sdict = dict; if (sdict.indexOf("%") > 0) { sdict = getDict(dict); }
try { url = new URL("http://wiki.ubuntu.org.cn/" + ex + ":" + sdict + "?action=raw"); conn = url.openConnection(); conn.setReadTimeout(60000); conn.setRequestProperty("Cookie", cookie); in = conn.getInputStream(); String wikihtml = getHtmlFromStream(in); int wikilen = (wikihtml.replaceAll("\r", "")).replaceAll("\n", "").length(); int htmllen = (html.replaceAll("\r", "")).replaceAll("\n", "").length(); if (wikilen == htmllen) { System.out.println(dict + " no changes."); return; } else { System.out.println(dict + " length: " + html.length() + " oldlength: " + wikihtml.length()); } } catch (Exception ex0) { }
String tempdict = sdict.replaceAll("%23", ""); String address = "http://wiki.ubuntu.org.cn/" + ex + ":" + tempdict + "?action=edit"; String wikihtml = null; String from = null; Boolean flag = false; for (int i = 0; i < 5; i++) { wikihtml = getWikiHtml(address); int start = wikihtml.indexOf("<form id=\"editform\"");
int end = wikihtml.indexOf("