collect.py

#
# Metrix++, Copyright 2009-2019, Metrix++ Project
# Link: https://github.com/metrixplusplus/metrixplusplus
#
# This file is a part of Metrix++ Tool.
#

from metrixpp.mpp import api

import re
import os
import sys
import logging
import time
import binascii
import fnmatch
import multiprocessing.pool


class Plugin(api.Plugin, api.Parent, api.IConfigurable, api.IRunable):

    def __init__(self):
        self.reader = DirectoryReader()
        self.include_rules = []
        self.exclude_rules = []
        self.exclude_files = []
        self.parsers = []
        super(Plugin, self).__init__()

    def declare_configuration(self, parser):
        parser.add_option("--std.general.proctime", "--sgpt", action="store_true", default=False,
                          help="If the option is set (True), the tool measures processing time per file [default: %default]")
        parser.add_option("--std.general.procerrors", "--sgpe", action="store_true", default=False,
                          help="If the option is set (True), the tool counts the number of processing/parsing errors per file [default: %default]")
        parser.add_option("--std.general.size", "--sgs", action="store_true", default=False,
                          help="If the option is set (True), the tool collects the file size metric (in bytes) [default: %default]")
        parser.add_option("--include-files", "--if", action='append',
                          help="Adds a regular expression pattern to include files in processing (a file has to match at least one include rule to be included)")
        parser.add_option("--exclude-files", "--ef", action='append',
                          help="Adds a regular expression pattern to exclude files or directories from processing")
        parser.add_option("--non-recursively", "--nr", action="store_true", default=False,
                          help="If the option is set (True), sub-directories are not processed [default: %default]")
        self.optparser = parser
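
    # Illustrative invocation (a sketch; the exact driver command depends on
    # how Metrix++ is installed):
    #   metrix++ collect --std.general.size --include-files=".*[.](c|h)$" --exclude-files="^test"
    # Each --include-files/--exclude-files value is compiled via re.compile()
    # in configure() below.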

    def configure(self, options):
        self.is_proctime_enabled = options.__dict__['std.general.proctime']
        self.is_procerrors_enabled = options.__dict__['std.general.procerrors']
        self.is_size_enabled = options.__dict__['std.general.size']
        # check if any include rule is given
        if options.__dict__['include_files']:
            try:
                for include_rule in options.__dict__['include_files']:
                    self.add_include_rule(re.compile(include_rule))
            except Exception as e:
                self.optparser.error("option --include-files: " + str(e))
        else:
            self.add_include_rule(re.compile(r'.*'))
        # check if any exclude rule is given
        if options.__dict__['exclude_files']:
            try:
                for exclude_rule in options.__dict__['exclude_files']:
                    self.add_exclude_rule(re.compile(exclude_rule))
            except Exception as e:
                self.optparser.error("option --exclude-files: " + str(e))
        else:
            self.add_exclude_rule(re.compile(r'^[.]'))
        self.non_recursively = options.__dict__['non_recursively']

    def initialize(self):
        fields = []
        if self.is_proctime_enabled:
            fields.append(self.Field('proctime', float))
        if self.is_procerrors_enabled:
            fields.append(self.Field('procerrors', int))
        if self.is_size_enabled:
            fields.append(self.Field('size', int))
        super(Plugin, self).initialize(namespace='std.general', support_regions=False, fields=fields)
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_path())
        self.add_exclude_file(self.get_plugin('metrixpp.mpp.dbf').get_dbfile_prev_path())

    def run(self, args):
        if len(args) == 0:
            return self.reader.run(self, "./")
        retcode = 0
        for directory in args:
            retcode += self.reader.run(self, directory)
        return retcode

    def register_parser(self, fnmatch_exp_list, parser):
        self.parsers.append((fnmatch_exp_list, parser))

    def get_parser(self, file_path):
        for fnmatch_exp_list, parser in self.parsers:
            for fnmatch_exp in fnmatch_exp_list:
                if fnmatch.fnmatch(file_path, fnmatch_exp):
                    return parser
        return None
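
    # For example (illustrative): a parser registered with the pattern list
    # ('*.c', '*.h') would be returned for "src/main.c", since fnmatch matches
    # the whole path string against each glob pattern.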

    def add_include_rule(self, re_compiled_pattern):
        self.include_rules.append(re_compiled_pattern)

    def add_exclude_rule(self, re_compiled_pattern):
        self.exclude_rules.append(re_compiled_pattern)

    def add_exclude_file(self, file_path):
        if file_path is None:
            return
        self.exclude_files.append(file_path)

    def is_file_excluded(self, file_name):
        # apply the include rules to files only - skip directories
        if os.path.isfile(file_name):
            for each in self.include_rules:
                if re.match(each, os.path.basename(file_name)) is not None:
                    break
            else:
                # for/else: no include rule matched, so the file is excluded
                return True
        # check the exclude rules for both files and directories
        for each in self.exclude_rules:
            if re.match(each, os.path.basename(file_name)) is not None:
                return True
        # finally check if the file is excluded directly
        for each in self.exclude_files:
            if os.path.basename(each) == os.path.basename(file_name):
                if os.stat(each) == os.stat(file_name):
                    return True
        return False
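
    # Note: rule matching above is done against the basename, not the full
    # path. With the default exclude rule r'^[.]' this skips dot-files and
    # dot-directories; exclude_files entries (e.g. the tool's own database
    # files) are compared by basename and os.stat() identity.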


class DirectoryReader():

    def readtextfile(self, filename):
        """ Read a text file and try to detect its encoding.

        Since we examine program code text files, we can assume the following:
        - There are no NUL characters, i.e. no 0x00 sequences of 1, 2 or 4
          bytes, starting on 1, 2 or 4 byte boundaries (depending on the
          1, 2 or 4 byte coding)
        - There should be at least one space (ASCII 0x20) char
          of the respective length (1, 2 or 4 bytes)
        - Program code consists of ASCII chars only, i.e. codes < 128
        - Non-ASCII chars should appear in string literals and comments only

        Especially in the case of an 8 bit coding it does not matter which
        code page is used: the metric analysis is done on program code, which
        is pure ASCII; string literals and comments are only recognized as
        such but not interpreted, so it does not matter if they contain
        non-ASCII chars, whichever code page is used.

        Note the decoder's different behavior for the "utf_nn" identifiers:
        - .decode("utf_32") / .decode("utf_16"): a preceding BOM is skipped
        - with suffix "_be" or "_le" respectively: a preceding BOM is preserved
        but
        - .decode("utf_8"): a preceding BOM is preserved
        - .decode("utf_8_sig"): a preceding BOM is skipped
        """
        # Methods to check for various UTF variants without BOM:
        # Since UTF16/32 codings are recommended to use a BOM, these methods
        # shouldn't be necessary, but they may be useful in certain cases.
        def checkforUTF32_BE(a):
            if (len(a) % 4) != 0:
                return False
            n = a.find(b'\x00\x00\x00\x20')
            return (n >= 0) and ((n % 4) == 0)

        def checkforUTF32_LE(a):
            if (len(a) % 4) != 0:
                return False
            n = a.find(b'\x20\x00\x00\x00')
            return (n >= 0) and ((n % 4) == 0)

        def checkforUTF16_BE(a):
            if (len(a) % 2) != 0:
                return False
            n = a.find(b'\x00\x20')
            return (n >= 0) and ((n % 2) == 0)

        def checkforUTF16_LE(a):
            if (len(a) % 2) != 0:
                return False
            n = a.find(b'\x20\x00')
            return (n >= 0) and ((n % 2) == 0)
        # Method to check for UTF8 without BOM:
        # "a" is the text file represented as a plain byte array!
        # Find the first char with a code > 127:
        #
        # 1 nothing found: all bytes are 0..127; in this case "a" only consists
        #   of ASCII chars, but this may also be treated as valid UTF8 coding
        #
        # 2 the code is a valid UTF8 leading byte: 192..243
        #   then check the subsequent bytes to be UTF8 extension bytes: 128..191
        #   Some additional plausibility checks are done as well:
        #   if a valid UTF8 byte sequence is found,
        #   - the byte following the UTF8 sequence must be an ASCII char
        #   - or another UTF8 leading byte (in the latter case we assume that
        #     the appropriate number of UTF8 extension bytes follows..)
        #   Note that these checks don't guarantee the text is really UTF8
        #   encoded: if a valid UTF8 sequence is found but the text is in fact
        #   in some 8 bit OEM coding, this may coincidentally be a sequence of
        #   8 bit OEM chars. This seems very unlikely but may happen...
        #   Even if the whole text were examined for UTF8 sequences: every
        #   valid UTF8 sequence found may also be a sequence of OEM chars!
        #
        # 3 the code is not a valid UTF8 leading byte: 128..191 or 244..255
        #   In this case the coding is some sort of 8 bit OEM coding. Since we
        #   don't know which OEM code page the file was written with, we assume
        #   "latin_1" (mostly the same as ANSI, but "ansi" isn't available on
        #   Python 2)
        #
        # returns the suggested text coding: "ascii", "utf_8" or "latin_1" (resp. the default)
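        # For reference, the UTF8 byte layout assumed by the checks below
        # (the ranges mirror the code):
        #   2-byte sequence: 110xxxxx 10xxxxxx                   (leading 192..223)
        #   3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx          (leading 224..239)
        #   4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (leading 240..243 here)
        # Extension bytes always have the form 10xxxxxx, i.e. 128..191.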
        def checkforUTF8(a, default="latin_1"):
            # Since "a" is a string array on Python 2, a special ORD function
            # is used: it converts c to its byte value if c is a character.
            # Works for Python 2+3.
            def ORD(c): return ord(c) if (type(c) == str) else c
            L = len(a)
            n = 0
            while (n < L) and (ORD(a[n]) < 128):
                n = n + 1
            if n >= L:          # all chars < 128: ASCII coding
                return "ascii"  # but may also be treated as UTF8!
            w = a[n]
            # UTF8 two byte sequence: leading byte + 1 extension byte
            if ORD(w) in range(192, 224):
                if ((n+1 < L)
                        and (ORD(a[n+1]) in range(128, 192))):    # valid UTF8 extension byte
                    if ((n+2 == L)                                # sequence ends the buffer
                            or (ORD(a[n+2]) < 128)                # or next byte is an ASCII char
                            or (ORD(a[n+2]) in range(192, 244))): # or next byte is a UTF8 leading byte
                        return "utf_8"
                return default
            # UTF8 three byte sequence: leading byte + 2 extension bytes
            if ORD(w) in range(224, 240):
                if ((n+2 < L)
                        and (ORD(a[n+1]) in range(128, 192))      # 2 valid UTF8 extension bytes
                        and (ORD(a[n+2]) in range(128, 192))):
                    if ((n+3 == L)                                # sequence ends the buffer
                            or (ORD(a[n+3]) < 128)                # or next byte is an ASCII char
                            or (ORD(a[n+3]) in range(192, 244))): # or next byte is a UTF8 leading byte
                        return "utf_8"
                return default
            # UTF8 four byte sequence: leading byte + 3 extension bytes
            if ORD(w) in range(240, 244):
                if ((n+3 < L)
                        and (ORD(a[n+1]) in range(128, 192))      # 3 valid UTF8 extension bytes
                        and (ORD(a[n+2]) in range(128, 192))
                        and (ORD(a[n+3]) in range(128, 192))):
                    if ((n+4 == L)                                # sequence ends the buffer
                            or (ORD(a[n+4]) < 128)                # or next byte is an ASCII char
                            or (ORD(a[n+4]) in range(192, 244))): # or next byte is a UTF8 leading byte
                        return "utf_8"
                return default
            # no valid UTF8 byte sequence:
            return default
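        # Illustrative results of checkforUTF8 above:
        #   checkforUTF8(b'abc')          -> "ascii"    (all bytes < 128)
        #   checkforUTF8(b'ab\xc3\xa9c')  -> "utf_8"    (0xC3 0xA9 is a valid 2-byte sequence)
        #   checkforUTF8(b'ab\xe9c')      -> "latin_1"  (0xE9 lacks extension bytes)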
        # end of checkforUTF8 ------------------------------------------------

        # ----------------------------------------------------------------------
        # Subroutine readtextfile:
        # open the file as binary and try to guess the encoding
        # attention:
        # - Python 3: "a" is a byte array
        # - Python 2: "a" is a string array!
        # ----------------------------------------------------------------------
        with open(filename, 'rb') as f:
            a = f.read()
        # check for codings with BOM:
        # consider the order: check for UTF32 first!
        if (a.startswith(b'\xff\xfe\x00\x00')
                or a.startswith(b'\x00\x00\xfe\xff')):
            coding = "utf_32"   # no suffix _be/_le --> the decoder skips the BOM
        elif (a.startswith(b'\xff\xfe')
                or a.startswith(b'\xfe\xff')):
            coding = "utf_16"   # no suffix _be/_le --> the decoder skips the BOM
        elif a.startswith(b'\xef\xbb\xbf'):
            coding = "utf_8_sig"
        # elif: there are some other codings with BOM - feel free to add them here
        # check for UTF variants without BOM:
        # consider the order: check for UTF32 first!
        elif checkforUTF32_BE(a):
            coding = "utf_32_be"
        elif checkforUTF32_LE(a):
            coding = "utf_32_le"
        elif checkforUTF16_BE(a):
            coding = "utf_16_be"
        elif checkforUTF16_LE(a):
            coding = "utf_16_le"
        # so finally we only have to look for UTF8 without BOM:
        else:
            coding = checkforUTF8(a)
        # Decode to text with the detected coding; since our guess may be wrong,
        # we replace unknown chars to avoid errors. Because we examine program
        # code files (i.e. true program code should consist of ASCII chars only),
        # these replacements should only affect string literals and comments and
        # should have no effect on the metric analysis.
        text = a.decode(coding, 'replace')
        # Finally replace possible line break variants with \n:
        # todo: replace with a regex
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        return text
    # end of readtextfile --------------------------------------------------
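
    # Minimal usage sketch for readtextfile (illustrative; "example.c" is a
    # hypothetical file name, not part of the Metrix++ test data):
    #   reader = DirectoryReader()
    #   text = reader.readtextfile("example.c")  # unicode text with \n line breaks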

    def run(self, plugin, directory):
        IS_TEST_MODE = False
        if 'METRIXPLUSPLUS_TEST_MODE' in list(os.environ.keys()):
            IS_TEST_MODE = True

        def run_per_file(plugin, fname, full_path):
            exit_code = 0
            norm_path = re.sub(r'[\\]', "/", full_path)
            if not os.path.isabs(norm_path) and not norm_path.startswith('./'):
                norm_path = './' + norm_path
            if not plugin.is_file_excluded(norm_path):
                if os.path.isdir(full_path):
                    if not plugin.non_recursively:
                        exit_code += run_recursively(plugin, full_path)
                else:
                    parser = plugin.get_parser(full_path)
                    if parser is None:
                        logging.info("Skipping: " + norm_path)
                    else:
                        logging.info("Processing: " + norm_path)
                        ts = time.time()
                        text = self.readtextfile(full_path)
                        # mask crc32 to match Python 3 semantics
                        checksum = binascii.crc32(text.encode('utf8')) & 0xffffffff
                        db_loader = plugin.get_plugin('metrixpp.mpp.dbf').get_loader()
                        (data, is_updated) = db_loader.create_file_data(norm_path, checksum, text)
                        procerrors = parser.process(plugin, data, is_updated)
                        if plugin.is_proctime_enabled:
                            data.set_data('std.general', 'proctime',
                                          (time.time() - ts) if not IS_TEST_MODE else 0.01)
                        if plugin.is_procerrors_enabled and procerrors is not None and procerrors != 0:
                            data.set_data('std.general', 'procerrors', procerrors)
                        if plugin.is_size_enabled:
                            data.set_data('std.general', 'size', len(text))
                        db_loader.save_file_data(data)
                        if procerrors is not None:  # parsers may return None
                            exit_code += procerrors
            else:
                logging.info("Excluding: " + norm_path)
            return exit_code

        # thread_pool = multiprocessing.pool.ThreadPool()
        # def mp_worker(args):
        #     run_per_file(args[0], args[1], args[2])
        def run_recursively(plugin, directory):
            exit_code = 0
            # thread_pool.map(mp_worker,
            #                 [(plugin, f, os.path.join(subdir, f))
            #                  for subdir, dirs, files in os.walk(directory) for f in files])
            for fname in sorted(os.listdir(directory)):
                full_path = os.path.join(directory, fname)
                exit_code += run_per_file(plugin, fname, full_path)
            return exit_code

        if not os.path.exists(directory):
            logging.error("Skipping (does not exist): " + directory)
            return 1
        if os.path.isdir(directory):
            total_errors = run_recursively(plugin, directory)
        else:
            total_errors = run_per_file(plugin, os.path.basename(directory), directory)
        total_errors = total_errors  # errors are reported per file as warnings
        return 0  # ignore errors here: the collection is successful anyway