From 6bf1e4363edf940cb409f87636c541b69260cf02 Mon Sep 17 00:00:00 2001
From: chaoskagami
Date: Tue, 26 Jan 2016 00:05:24 -0500
Subject: [PATCH] Initial commit

---
 4ch-to-mw.py | 262 +++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  11 +++
 2 files changed, 273 insertions(+)
 create mode 100755 4ch-to-mw.py
 create mode 100644 README.md

diff --git a/4ch-to-mw.py b/4ch-to-mw.py
new file mode 100755
index 0000000..f915cb0
--- /dev/null
+++ b/4ch-to-mw.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+########################################################################
+# Configuration properties.
+
+#-----------------------------------#
+# Bland mirror preset.
+
+# do_filter = False
+
+#-----------------------------------#
+# Preset for sup/tg/ 'Magical Girl Noir'
+
+# Enable filtered save by nametag.
+do_filter = True
+# Names to consider top-level when filtering.
+filter_names = [ "Deculture", "Protoculture" ]
+# If filter is enabled, level of filter.
+filter_level = -1
+# Delete images unrelated to saved export when filtering.
+remove_unrelated_images = True
+#-----------------------------------#
+
+########################################################################
+
+import json
+from pprint import pprint
+import os
+import re
+import sys
+from subprocess import call
+import shutil
+from lxml.html import parse, tostring
+import requests
+
+if len(sys.argv) < 2:
+	sys.exit("Required ID argument missing.")
+
+########################################################################
+# Fetch the URL, js and css, and all images (not thumbs) from sup/tg/
+def fetch_suptg(post_url, post_id):
+	os.mkdir(post_id)
+	os.mkdir(post_id + "/t")
+	os.mkdir(post_id + "/t/images")
+
+	open(post_id + "/t/index.html", 'wb').write(requests.get(post_url).content)
+
+	page_html = parse(post_id + "/t/index.html").getroot()
+	posts = page_html.find_class("post")
+	for p in posts:
+		img = ""
+		try:
+			href = p.find_class("fileThumb")[0].get("href")
+			img_url = post_url + href
+
+			open(post_id + "/t/" + href, 'wb').write(requests.get(img_url).content)
+		except:
+			pass
+
+########################################################################
+# Fetch the URL, js and css, and all images (not thumbs) from 4chan
+def fetch_4c(post_url, post_id):
+	os.mkdir(post_id)
+	os.mkdir(post_id + "/t")
+	os.mkdir(post_id + "/t/images")
+
+	img_re = re.compile("//i.4cdn.org/[a-zA-Z]+/")
+
+	page_html = parse(post_url)
+	posts = page_html.getroot().find_class("post")
+	for p in posts:
+		img = ""
+		try:
+			href = p.find_class("fileThumb")[0].get("href")
+			img_url = "http:" + href
+			img_out = post_id + "/t/images/" + img_re.sub("", href)
+
+			p.find_class("fileThumb")[0].set("href", "images/" + img_re.sub("", href))
+
+			open(img_out, 'wb').write(requests.get(img_url).content)
+		except:
+			pass
+	open(post_id + "/t/index.html", 'wb').write(tostring(page_html.getroot()))
+
+########################################################################
+# Extract posts from HTML (replaces dump.sh)
+def post_split(thr_id):
+	ret = []
+	page_html = parse("index.html").getroot()
+	posts = page_html.find_class("post")
+	for p in posts:
+		data = {'id':"",'subj':"",'name':"",'time':"",'img':"",'msg':""}
+
+		data['id'] = p.find_class("postInfo")[0].get("id").replace("pi", "")
+		try:
+			data['subj'] = p.find_class("subject")[0].text_content()
+		except:
+			pass
+		data['name'] = p.find_class("name")[0].text_content()
+		data['time'] = p.find_class("dateTime")[0].get("data-utc")
+		try:
+			data['img'] = p.find_class("fileThumb")[0].get("href").replace("images/", "")
+		except:
+			pass
+		data['msg'] = p.find_class("postMessage")[0]
+		for e in list(data['msg'].iter()):
+			if e.tag == "br":
+				if e.tail:
+					e.tail = "\n" + e.tail
+				else:
+					e.tail = "\n"
+
+		data['msg'] = data['msg'].text_content()
+
+		ret.append(data)
+	return ret
+
+########################################################################
+# Dumps post list from post_split as text.
+def dump_text(out_file, post_list):
+	f = open(out_file, "wb")
+	for p in post_list:
+		header = p['name'] + " | " + p['time'] + " | No." + p['id']
+		if p['subj'] != "":
+			header = p['subj'] + " | " + header
+		f.write(header + "\n")
+		if p['img'] != "":
+			f.write(p['img'] + "\n")
+		f.write(p['msg'] + "\n")
+		f.write("----------------------------------\n")
+
+########################################################################
+# Dumps post list from post_split as mediawiki markup.
+def dump_mediawiki_markup(out_file, post_list, post_id):
+	f = open(out_file, "wb")
+	f.write(("Thread " + post_id + "").encode('utf-8'))
+	f.write(("\n").encode('utf-8'))
+	for p in post_list:
+		header = p['name'] + " " + p['time'] + " No." + p['id']
+		if p['subj'] != "":
+			header = p['subj'] + " " + header
+		f.write("" + header + "\n")
+		if p['img'] != "":
+			f.write("[[File:" + p['img'] + "|thumb|left|200x200px]]\n")
\n")
+		encoded_msg = p['msg'].replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+		f.write((encoded_msg + "\n").encode('utf-8'))
+		f.write("
\n") + f.write(("----------------------------------\n").encode('utf-8')) + f.write(("
").encode('utf-8')) + +######################################################################## +# Retrieves references from within text as a list. +def get_refs(post): + ref_re = re.compile('>>[0-9]+') + refs = ref_re.findall(post['msg']) + ret = [] + if refs: + for r in refs: + ret.append(r.replace(">>", "")) + return ret + +######################################################################## +# Filters post list by name, optionally also by references to a depth. +# If link_level is -1, this means 'resolve all'. If 0 or nil, no refs. +# Any positive number is interpreted as a depth. +def filter_posts(post_list, names, link_level): + ret = [] + ids = [] + for p in post_list: + done = False + for n in names: + if done: + break + if p['name'] == n: + ids.append(p['id']) + done = True + + while link_level > 0 or link_level == -1: + changed = False + for p in post_list: + for i in ids: + if p['id'] == i: + refs = get_refs(p) + length = len(ids) + ids = ids + refs + ids = list(set(ids)) + if length != len(ids): + changed = True + if changed == False: + break + if link_level > 0: + link_level -= 1 + + for p in post_list: + for i in ids: + if p['id'] == i: + ret.append(p) + break + + ret.sort(key=lambda x: x['id']) + return ret + +thr_url = sys.argv[1] + +eof_clip = re.compile("/$") +head_clip = re.compile("http[s]?://") +suptg = re.compile("suptg.thisisnotatrueending.com/archive/") +ch4 = re.compile("boards.4chan.org/[a-zA-Z]+/thread/") + +foot_url = eof_clip.sub ("", thr_url) +clip_url = head_clip.sub("", foot_url) + +ch4_id = ch4.sub ("", clip_url) +suptg_id = suptg.sub("", clip_url) + +check_id = re.compile("^[0-9]+$") + +is_ch4 = check_id.match(ch4_id) +is_suptg = check_id.match(suptg_id) + +thr_id = "" + +if is_suptg != None: + print("[4ch-dump] Type: sup/tg/ url") + print("[4ch-dump] Fetching...") + fetch_suptg(thr_url, suptg_id) + thr_id = suptg_id +elif is_ch4 != None: + print("[4ch-dump] Type: 4chan url") + print("[4ch-dump] Fetching...") + fetch_4c(thr_url, ch4_id) + thr_id = ch4_id +else: + print("ukn") + +print("[4ch-dump] Splitting posts...") +os.chdir(thr_id + "/t") +post_list = post_split(thr_id) + +if do_filter: + print("[4ch-dump] Applying filter...") + post_list = filter_posts(post_list, filter_names, filter_level) + +print("[4ch-dump] Dumping mediawiki text...") +dump_mediawiki_markup(thr_id + ".xml", post_list, thr_id) + +print("[4ch-dump] Removing unreferenced images...") +if do_filter and remove_unrelated_images: + os.rename("images", "images.old") + os.mkdir("images") + for p in post_list: + if p['img'] != "": + os.rename("images.old/" + p['img'], "images/" + p['img']) + +print("[4ch-dump] Deleting old shit...") +os.rename("images", "../images") +os.rename(thr_id + ".xml", "../" + thr_id + ".xml") +os.chdir("..") + +shutil.rmtree("t") diff --git a/README.md b/README.md new file mode 100644 index 0000000..dbfbbfd --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +4ch-to-mw +===================== +This is a crappy-ass python tool that can be used to recompile a 4chan thread (or sup/tg/ archive) to a mediawiki dump. + +Why? Because...well. Threads after 5 on wiki.magicalgirlnoir.com are hard to read because. This was originally coded in bash. Thank god nobody saw that mess. + +Read the header for usage, for now. I'll make proper console syntax sometime soon. For now though; it works. + +You need: + * python2 + * lxml -- 2.39.5