fork

2019-08-30 19:30:19 +02:00
commit 6f2b105ca0
595 changed files with 78405 additions and 0 deletions
--- a/util/scraper/file_concat.py
+++ b/util/scraper/file_concat.py
@@ -0,0 +1,21 @@
+import sys
+
+'''
+Concatenates the lines of all input files (every argument but the last), each stripped of surrounding whitespace, into the output file named by the last argument.
+'''
+
+if len(sys.argv) < 4:
+	print "Usage: file_concat.py <inputfile1>, <inputfile2>, ...  <outputfile>"
+	exit()
+else:
+	ins = sys.argv[1:-1]
+	out = sys.argv[-1]
+	outfile = open(out, 'w')
+
+	all_lines = []
+	for i in ins:
+		f = open(i, 'r')
+		lines = [x.strip() for x in f.readlines()]
+		all_lines += lines
+
+	outfile.write('\n'.join(all_lines))
--- a/util/scraper/keyword_join.py
+++ b/util/scraper/keyword_join.py
@@ -0,0 +1,26 @@
+import sys
+import keyword_scraper
+
+'''
+Invokes keyword_scraper to sort a file of keywords
+	
+Example:
+
+	$ python keyword_join.py keywords.txt sorted_keywords.txt
+'''
+
+if len(sys.argv) < 2:
+	print "Usage: keyword_scraper_tool <inputfile> <outputfile>"
+	exit()
+else:
+	infile_ = sys.argv[1]
+	outfile_ = sys.argv[2] if len(sys.argv) >= 3 else None
+
+	infile = open(infile_, 'r')
+	keywords = [x.strip() for x in infile.readlines()]
+	keywords.sort(keyword_scraper.cmp_keywords)
+
+	if outfile_:
+		outfile = open(outfile_, 'w')
+		outfile.write('\n'.join(keywords))
+	
--- a/util/scraper/keyword_scraper.py
+++ b/util/scraper/keyword_scraper.py
@@ -0,0 +1,73 @@
+import re
+import os
+
+def cmp_keywords(x,y):
+	'''
+	Sorts keywords by length, and then alphabetically
+	'''
+	if len(x) < len(y):
+		return 1
+	elif len(x) == len(y):
+		# Sort alphabetically
+		if x == y:
+			return 0
+		elif x < y:
+			return -1
+		else:
+			return 1
+	else:
+		return -1
+
+def keywords(infile, outdir):
+	'''
+	Scrapes comma separated keywords out of a file and sorts them in descending order of length.
+	It is assumed a keyword is surrounded in quotes ('' or ""), are grouped by commas and separated by line breaks.
+	The output is then printed and each group is written in text files in the given directory
+
+	An example use case for this is scraping keywords out of GeSHi language files:
+
+		>>> keywords('geshi_lang_file.php', 'somedir')
+
+	'''
+	if outdir and not os.path.exists(outdir):
+		os.makedirs(outdir)
+
+	f = open(infile, 'r')
+	fs = f.read()
+	fs = re.sub(r"(//.*?[\r\n])|(/\*.*?\*/)", '', fs)
+
+	matches = re.findall(r"(?:(?:'[^']+'|\"[^\"]+\")(?:[ \t]*[\r\n]?[ \t]*,[ \t]*[\r\n]?[ \t]*)?(?!\s*=>)){2,}", fs, flags=re.I | re.M | re.S)
+	output = ''
+	group = 0
+	for i in matches:
+		match = re.findall(r"'([^']+)'", i, flags=re.I | re.M | re.S)
+		match.sort(cmp=cmp_keywords)
+		suboutput = ''
+		for m in match:
+			m = m.strip()
+			if len(m) > 0:
+				suboutput += m + '\n'
+		suboutput += '\n'
+		if outdir:
+			w = open(outdir + '/' + str(group) + '.txt' , 'w')
+			w.write(suboutput)
+		output += suboutput
+		group += 1;
+
+	print output
+
+	exit()
+	matches = re.findall(r"(['\"])(.*?)\1", fs, re.I | re.M | re.S)
+	output = ''
+	if len(matches):
+		for m in matches:
+			s = m[1].strip()
+			if len(s) > 0:
+				output += s + '\n'
+	f.close()
+	print output
+	if w:
+		w.write(output)
+		w.close()
+
+
--- a/util/scraper/keyword_scraper_tool.py
+++ b/util/scraper/keyword_scraper_tool.py
@@ -0,0 +1,18 @@
+import sys
+import keyword_scraper
+
+'''
+Invokes keyword_scraper over command line
+	
+Example:
+
+	$ python keyword_scraper_tool.py geshi_lang_file.php somedir
+'''
+
+if len(sys.argv) < 2:
+	print "Usage: keyword_scraper_tool <inputfile> [directory]"
+	exit()
+else:
+	infile = sys.argv[1]
+	outdir = sys.argv[2] if len(sys.argv) >= 3 else None
+	keyword_scraper.keywords(infile, outdir)
--- a/util/scraper/readme.txt
+++ b/util/scraper/readme.txt
@@ -0,0 +1 @@
+I created this to help scrape keywords out of text files. Mostly I use it to scrape GeSHi language files and remove the bits I need.
				`@@ -0,0 +1 @@`
				`I created this to help scrape keywords out of text files. Mostly I use it to scrape GeSHi language files and remove the bits I need.`