This commit is contained in:
root
2019-08-30 19:30:19 +02:00
commit 6f2b105ca0
595 changed files with 78405 additions and 0 deletions

View File

@ -0,0 +1,21 @@
import sys
def main(argv=None):
    '''
    Concatenates all arguments together, assuming they are files, into the
    last argument file.

    Every argument except the last names an input file; the last one names
    the output file. Each input line has its leading and trailing whitespace
    stripped, and the resulting lines are written joined by single newlines
    (no trailing newline is added).

    argv -- full argument list including the program name; defaults to
            sys.argv when omitted.

    Prints a usage message and exits if fewer than two input files plus one
    output file are supplied.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        print("Usage: file_concat.py <inputfile1>, <inputfile2>, ... <outputfile>")
        sys.exit()

    ins = argv[1:-1]
    out = argv[-1]

    all_lines = []
    for path in ins:
        with open(path, 'r') as f:
            all_lines.extend(line.strip() for line in f)

    # Open the output only after every input has been read, so a missing or
    # unreadable input does not leave a truncated output file behind.
    with open(out, 'w') as outfile:
        outfile.write('\n'.join(all_lines))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,26 @@
import sys
import keyword_scraper
def main(argv=None):
    '''
    Invokes keyword_scraper to sort a file of keywords (one keyword per line).

    Each line of the input file is stripped of surrounding whitespace and the
    lines are ordered with keyword_scraper.cmp_keywords. The sorted keywords
    are written, newline separated, to the output file when one is given;
    otherwise they are printed to standard output.

    argv -- full argument list including the program name; defaults to
            sys.argv when omitted.

    Example:
    $ python <this_script>.py keywords.txt sorted_keywords.txt
    '''
    # Local import keeps this block self-contained; cmp_to_key adapts the
    # cmp-style comparator to the key= interface, which is the only form
    # list.sort accepts on Python 3.
    import functools

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Usage: keyword_scraper_tool <inputfile> <outputfile>")
        sys.exit()

    infile_ = argv[1]
    outfile_ = argv[2] if len(argv) >= 3 else None

    with open(infile_, 'r') as infile:
        keywords = [line.strip() for line in infile]

    keywords.sort(key=functools.cmp_to_key(keyword_scraper.cmp_keywords))

    sorted_text = '\n'.join(keywords)
    if outfile_:
        with open(outfile_, 'w') as outfile:
            outfile.write(sorted_text)
    else:
        # Without an output file the sorted result would otherwise be lost.
        print(sorted_text)


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,73 @@
import re
import os
def cmp_keywords(x, y):
    '''
    cmp-style comparator for keywords: longer keywords order first, and
    keywords of equal length are ordered alphabetically.

    Returns -1 if x should come before y, 1 if it should come after, and 0
    if the two are equal.
    '''
    len_x, len_y = len(x), len(y)
    if len_x != len_y:
        # Descending length: the shorter keyword sorts later.
        return 1 if len_x < len_y else -1
    if x == y:
        return 0
    # Same length, different text: fall back to alphabetical order.
    return -1 if x < y else 1
def keywords(infile, outdir):
    '''
    Scrapes comma separated keywords out of a file and sorts them in descending order of length.
    It is assumed a keyword is surrounded in quotes ('' or ""), are grouped by commas and separated by line breaks.
    The output is then printed and each group is written in text files in the given directory
    An example use case for this is scraping keywords out of GeSHi language files:
    >>> keywords('geshi_lang_file.php', 'somedir')

    infile -- path of the file to scrape.
    outdir -- directory that receives one "<group index>.txt" file per keyword
              group; it is created if missing. Pass None (or '') to only print.

    Within a group, keywords are stripped of surrounding whitespace, empty
    ones are dropped, and the rest are ordered longest first, then
    alphabetically (the same ordering cmp_keywords defines). Each group is
    emitted one keyword per line followed by a blank line.

    Returns None.
    '''
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    with open(infile, 'r') as f:
        fs = f.read()

    # Strip // line comments and /* */ comments before looking for keywords.
    # NOTE(review): '.' does not match newlines here, so /* */ comments that
    # span several lines are left in place - confirm whether that is intended.
    fs = re.sub(r"(//.*?[\r\n])|(/\*.*?\*/)", '', fs)

    # A group is a run of two or more quoted strings separated by commas; the
    # negative lookahead skips quoted strings used as array keys ('key' => ...).
    matches = re.findall(r"(?:(?:'[^']+'|\"[^\"]+\")(?:[ \t]*[\r\n]?[ \t]*,[ \t]*[\r\n]?[ \t]*)?(?!\s*=>)){2,}", fs, flags=re.I | re.M | re.S)

    chunks = []
    for group, text in enumerate(matches):
        # Accept both quote styles, matching what the group pattern accepts.
        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", text)
        found = [(single or double).strip() for single, double in quoted]
        found = [kw for kw in found if kw]
        found.sort(key=lambda kw: (-len(kw), kw))

        suboutput = ''.join(kw + '\n' for kw in found) + '\n'
        if outdir:
            with open(os.path.join(outdir, str(group) + '.txt'), 'w') as w:
                w.write(suboutput)
        chunks.append(suboutput)

    print(''.join(chunks))

View File

@ -0,0 +1,18 @@
import sys
import keyword_scraper
def main(argv=None):
    '''
    Invokes keyword_scraper over command line

    The first argument is the file to scrape; the optional second argument is
    a directory passed to keyword_scraper.keywords as its output directory
    (None is passed when it is omitted).

    argv -- full argument list including the program name; defaults to
            sys.argv when omitted.

    Example:
    $ python keyword_scraper_tool.py geshi_lang_file.php somedir
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Usage: keyword_scraper_tool <inputfile> [directory]")
        sys.exit()

    infile = argv[1]
    outdir = argv[2] if len(argv) >= 3 else None
    keyword_scraper.keywords(infile, outdir)


if __name__ == '__main__':
    main()

1
util/scraper/readme.txt Normal file
View File

@ -0,0 +1 @@
I created this to help scrape keywords out of text files. Mostly I use it to scrape GeSHi language files and extract the bits I need.