74 lines
1.6 KiB
Python
74 lines
1.6 KiB
Python
import re
|
|
import os
|
|
|
|
def cmp_keywords(x, y):
    '''
    Three-way comparison ordering keywords by descending length, then
    alphabetically.

    This is an old-style comparison function: pass it as ``cmp=`` to
    ``list.sort`` on Python 2, or wrap it with ``functools.cmp_to_key``.

    x, y -- the two keywords to compare (sized, mutually comparable values
            such as strings).

    Returns -1 when x sorts before y, 1 when x sorts after y and 0 when the
    two are equal.
    '''
    if len(x) != len(y):
        # The longer keyword sorts first, so it compares as the "smaller" one.
        return -1 if len(x) > len(y) else 1

    # Same length: fall back to plain alphabetical (ascending) order.
    if x == y:
        return 0
    return -1 if x < y else 1
|
|
|
|
def keywords(infile, outdir):
    '''
    Scrapes groups of quoted, comma separated keywords out of a file.

    A keyword is any non-empty text surrounded by quotes ('' or ""). Two or
    more keywords in a row, optionally separated by a comma (with at most one
    line break on either side of the comma), form a group. A quoted string
    that is followed by ``=>`` is not accepted as a keyword.

    Before scanning, ``//`` line comments and ``/* ... */`` comments that
    open and close on the same line are removed from the text.

    Within each group the keywords are sorted in descending order of length
    (ties are broken alphabetically), surrounding whitespace is stripped and
    keywords that end up empty are dropped. Every group is rendered as one
    keyword per line followed by a blank line. All groups are printed to
    standard output, and when outdir is given each group is also written to
    ``<outdir>/<n>.txt`` where n is the zero-based index of the group.

    An example use case for this is scraping keywords out of GeSHi language
    files:

    >>> keywords('geshi_lang_file.php', 'somedir')

    infile -- path of the file to scan.
    outdir -- directory receiving the per-group text files; it is created
              when it does not exist. A falsy value (None or '') disables
              file output.

    Returns None. Errors raised while opening, reading or writing files are
    not caught here.
    '''
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # The context manager closes the input even when reading fails.
    with open(infile, 'r') as source:
        text = source.read()

    # Drop comments so quoted text inside them is not picked up as keywords.
    # NOTE: no DOTALL flag is used, so a /* ... */ comment spanning several
    # lines is left in place.
    text = re.sub(r"(//.*?[\r\n])|(/\*.*?\*/)", '', text)

    groups = re.findall(r"(?:(?:'[^']+'|\"[^\"]+\")(?:[ \t]*[\r\n]?[ \t]*,[ \t]*[\r\n]?[ \t]*)?(?!\s*=>)){2,}", text, flags=re.I | re.M | re.S)

    output = ''
    for group, chunk in enumerate(groups):
        # Accept both quote styles, mirroring the group pattern above. Only
        # one of the two capture groups is filled in for any single hit.
        found = [single or double
                 for single, double
                 in re.findall(r"'([^']+)'|\"([^\"]+)\"", chunk)]

        # Longest first, ties alphabetical. A key function gives the same
        # order as an old-style cmp function and also works on Python 3.
        found.sort(key=lambda word: (-len(word), word))

        stripped = [word.strip() for word in found]
        suboutput = ''.join(word + '\n' for word in stripped if word) + '\n'

        if outdir:
            # Close every group file as soon as it has been written.
            with open(os.path.join(outdir, str(group) + '.txt'), 'w') as target:
                target.write(suboutput)

        output += suboutput

    # With a single argument the parenthesised form prints the same text
    # whether print is a statement (Python 2) or a function (Python 3).
    print(output)
|
|
|
|
|