fork
This commit is contained in:
21
util/scraper/file_concat.py
Normal file
21
util/scraper/file_concat.py
Normal file
@ -0,0 +1,21 @@
|
||||
import sys
|
||||
|
||||
'''
|
||||
Concatenates all arguments together, assuming they are files, into the last argument file.
|
||||
'''
|
||||
|
||||
def concat_files(inputs, output):
    '''
    Concatenate the lines of several text files into one output file.

    Every line of every input is stripped of leading and trailing
    whitespace, and the resulting lines are joined with a single newline
    (no trailing newline is written).

    inputs -- iterable of paths to read, in order
    output -- path of the file to (over)write
    '''
    all_lines = []
    for path in inputs:
        # 'with' guarantees each input is closed even if a later one fails.
        with open(path, 'r') as infile:
            all_lines.extend(line.strip() for line in infile)

    # Open the output only after every input was read successfully, so a
    # missing input no longer leaves a truncated output file behind.
    with open(output, 'w') as outfile:
        outfile.write('\n'.join(all_lines))


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print("Usage: file_concat.py <inputfile1>, <inputfile2>, ... <outputfile>")
        sys.exit()
    # Everything between the script name and the last argument is an input.
    concat_files(sys.argv[1:-1], sys.argv[-1])
|
26
util/scraper/keyword_join.py
Normal file
26
util/scraper/keyword_join.py
Normal file
@ -0,0 +1,26 @@
|
||||
import sys
|
||||
import keyword_scraper
|
||||
|
||||
'''
|
||||
Invokes keyword_scraper to sort a file of keywords
|
||||
|
||||
Example:
|
||||
|
||||
$ python keyword_join.py keywords.txt sorted_keywords.txt
|
||||
'''
|
||||
|
||||
def sort_keyword_file(infile_, outfile_=None):
    '''
    Sort a file of keywords (one per line) with keyword_scraper.cmp_keywords.

    Each line is stripped of surrounding whitespace before sorting.  The
    sorted keywords are joined with newlines and written to outfile_; when
    no output file is given they are printed to stdout instead of being
    silently discarded.

    infile_  -- path of the keyword file to read
    outfile_ -- optional path of the file to (over)write
    Returns the sorted list of keywords.
    '''
    # Local import: cmp_to_key adapts the cmp-style comparator so that the
    # sort works on both Python 2.7 and Python 3.
    import functools

    with open(infile_, 'r') as infile:
        keywords = [x.strip() for x in infile]
    keywords.sort(key=functools.cmp_to_key(keyword_scraper.cmp_keywords))

    joined = '\n'.join(keywords)
    if outfile_:
        with open(outfile_, 'w') as outfile:
            outfile.write(joined)
    else:
        print(joined)
    return keywords


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: keyword_join.py <inputfile> [outputfile]")
        sys.exit()
    infile_ = sys.argv[1]
    outfile_ = sys.argv[2] if len(sys.argv) >= 3 else None
    sort_keyword_file(infile_, outfile_)
|
||||
|
73
util/scraper/keyword_scraper.py
Normal file
73
util/scraper/keyword_scraper.py
Normal file
@ -0,0 +1,73 @@
|
||||
import re
|
||||
import os
|
||||
|
||||
def cmp_keywords(x, y):
    '''
    cmp-style comparator: longest keywords first, then alphabetically.

    Returns -1 if x should sort before y, 1 if it should sort after, and 0
    if the two are equal.  A longer string always sorts before a shorter
    one; strings of equal length are ordered by ordinary string comparison.

    On Python 3 (or with sorted(key=...)) wrap it with
    functools.cmp_to_key.
    '''
    if len(x) != len(y):
        # Descending length: the shorter string goes last.
        return 1 if len(x) < len(y) else -1
    if x == y:
        return 0
    # Same length: fall back to alphabetical order.
    return -1 if x < y else 1
|
||||
|
||||
def keywords(infile, outdir):
    '''
    Scrapes groups of comma separated keywords out of a file and sorts each
    group in descending order of length (ties broken alphabetically).

    It is assumed a keyword is surrounded in quotes ('' or ""), and that the
    keywords of a group are separated by commas, optionally with a line
    break around each comma.  A run of two or more such quoted strings forms
    one group; quoted strings directly followed by "=>" are not part of a
    group.

    The output (one keyword per line, a blank line after every group) is
    printed and returned.  If outdir is given, it is created when missing
    and each group is also written to <outdir>/<group index>.txt.

    An example use case for this is scraping keywords out of GeSHi language
    files:

        keywords('geshi_lang_file.php', 'somedir')

    infile -- path of the file to scrape
    outdir -- directory for the per-group files, or None to only print
    '''
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(infile, 'r') as f:
        fs = f.read()

    # Drop // line comments and /* ... */ comments before matching.
    # NOTE(review): without re.S the '.' does not cross line breaks, so a
    # /* ... */ comment spanning several lines is NOT removed here.
    fs = re.sub(r"(//.*?[\r\n])|(/\*.*?\*/)", '', fs)

    matches = re.findall(r"(?:(?:'[^']+'|\"[^\"]+\")(?:[ \t]*[\r\n]?[ \t]*,[ \t]*[\r\n]?[ \t]*)?(?!\s*=>)){2,}", fs, flags=re.I | re.M | re.S)

    groups = []
    for group, i in enumerate(matches):
        # Pull the text out of both quoting styles; exactly one of the two
        # capture groups is non-empty for every hit.
        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", i)
        match = [single or double for single, double in quoted]
        # Longest first, then alphabetical -- the same order cmp_keywords
        # defines, expressed as a key so it works on Python 2 and 3.
        match.sort(key=lambda kw: (-len(kw), kw))

        stripped = [m.strip() for m in match]
        suboutput = ''.join(m + '\n' for m in stripped if m) + '\n'

        if outdir:
            with open(os.path.join(outdir, str(group) + '.txt'), 'w') as w:
                w.write(suboutput)
        groups.append(suboutput)

    output = ''.join(groups)
    print(output)
    # Return instead of calling exit(): a library function must not
    # terminate the interpreter of whoever imported it.
    return output
|
||||
|
||||
|
18
util/scraper/keyword_scraper_tool.py
Normal file
18
util/scraper/keyword_scraper_tool.py
Normal file
@ -0,0 +1,18 @@
|
||||
import sys
|
||||
import keyword_scraper
|
||||
|
||||
'''
|
||||
Invokes keyword_scraper over command line
|
||||
|
||||
Example:
|
||||
|
||||
$ python keyword_scraper_tool.py geshi_lang_file.php somedir
|
||||
'''
|
||||
|
||||
# Command line entry point: scrape <inputfile> and, when a directory is
# given, also write each keyword group into it.  Guarded so that importing
# this module does not parse sys.argv or exit the interpreter.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: keyword_scraper_tool <inputfile> [directory]")
        sys.exit()
    infile = sys.argv[1]
    # The output directory is optional; None means "only print".
    outdir = sys.argv[2] if len(sys.argv) >= 3 else None
    keyword_scraper.keywords(infile, outdir)
|
1
util/scraper/readme.txt
Normal file
1
util/scraper/readme.txt
Normal file
@ -0,0 +1 @@
|
||||
I created this to help scrape keywords out of text files. Mostly I use it to scrape GeSHi language files and extract the bits I need.
|
Reference in New Issue
Block a user