1
0
mirror of https://github.com/ytdl-org/youtube-dl.git synced 2024-11-16 14:27:53 +00:00

[jsinterp] Improve parsing

* support subset `... else if ...`
* support `while`
* add `RegExp` class
* generalise `new` support
* limit more debug strings
* matching test changes
This commit is contained in:
dirkf 2023-02-02 14:28:32 +00:00
parent 14ef89a8da
commit 295736c9cb
2 changed files with 154 additions and 55 deletions

View File

@ -11,8 +11,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import math import math
import re import re
from youtube_dl.compat import compat_re_Pattern
from youtube_dl.jsinterp import JS_Undefined, JSInterpreter from youtube_dl.jsinterp import JS_Undefined, JSInterpreter
@ -140,15 +138,23 @@ class TestJSInterpreter(unittest.TestCase):
''') ''')
self.assertTrue(math.isnan(jsi.call_function('x'))) self.assertTrue(math.isnan(jsi.call_function('x')))
def test_Date(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; }
''') ''')
self.assertEqual(jsi.call_function('x'), 86000) self.assertEqual(jsi.call_function('x'), 86000)
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x(dt) { return new Date(dt) - 0; } function x(dt) { return new Date(dt) - 0; }
''') ''')
self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000)
# date format m/d/y
jsi = JSInterpreter('''
function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; }
''')
self.assertEqual(jsi.call_function('x'), 86000)
def test_call(self): def test_call(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { return 2; } function x() { return 2; }
@ -181,6 +187,15 @@ class TestJSInterpreter(unittest.TestCase):
self.assertEqual(jsi.call_function('x'), 10) self.assertEqual(jsi.call_function('x'), 10)
""" # Unsupported """ # Unsupported
jsi = JSInterpreter('''
function x() {
if (0!=0) return 1;
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
"""
def test_elseif(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { function x() {
if (0!=0) {return 1} if (0!=0) {return 1}
@ -188,6 +203,16 @@ class TestJSInterpreter(unittest.TestCase):
else {return 10} else {return 10}
}''') }''')
self.assertEqual(jsi.call_function('x'), 10) self.assertEqual(jsi.call_function('x'), 10)
""" # Unsupported
jsi = JSInterpreter('''
function x() {
if (0!=0) return 1;
else if (1==0) {return 2}
else {return 10}
}''')
self.assertEqual(jsi.call_function('x'), 10)
# etc
""" """
def test_for_loop(self): def test_for_loop(self):
@ -197,6 +222,13 @@ class TestJSInterpreter(unittest.TestCase):
''') ''')
self.assertEqual(jsi.call_function('x'), 10) self.assertEqual(jsi.call_function('x'), 10)
def test_while_loop(self):
    # Checks that a `while` loop body repeats until the condition fails
    # and that the value computed in the loop can then be returned.
    # function x() { a=0; while (a<10) {a++} a }
    js_code = '''
function x() { a=0; while (a<10) {a++} return a }
'''
    interpreter = JSInterpreter(js_code)
    result = interpreter.call_function('x')
    self.assertEqual(result, 10)
def test_switch(self): def test_switch(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x(f) { switch(f){ function x(f) { switch(f){
@ -415,13 +447,28 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/; return a; } function x() { let a=/,,[/,913,/](,)}/; return a; }
''') ''')
self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) attrs = set(('findall', 'finditer', 'flags', 'groupindex',
'groups', 'match', 'pattern', 'scanner',
'search', 'split', 'sub', 'subn'))
self.assertTrue(set(dir(jsi.call_function('x'))) > attrs)
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/i; return a; } function x() { let a=/,,[/,913,/](,)}/i; return a; }
''') ''')
self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I)
jsi = JSInterpreter(r'''
function x() { let a=[/[)\\]/]; return a[0]; }
''')
self.assertEqual(jsi.call_function('x').pattern, r'[)\\]')
""" # fails
jsi = JSInterpreter(r'''
function x() { let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; }
''')
self.assertEqual(jsi.call_function('x'), 5)
"""
def test_char_code_at(self): def test_char_code_at(self):
jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}')
self.assertEqual(jsi.call_function('x', 0), 116) self.assertEqual(jsi.call_function('x', 0), 116)

View File

@ -187,19 +187,6 @@ class LocalNameSpace(ChainMap):
class JSInterpreter(object): class JSInterpreter(object):
__named_object_counter = 0 __named_object_counter = 0
_RE_FLAGS = {
# special knowledge: Python's re flags are bitmask values, current max 128
# invent new bitmask values well above that for literal parsing
# TODO: new pattern class to execute matches with these flags
'd': 1024, # Generate indices for substring matches
'g': 2048, # Global search
'i': re.I, # Case-insensitive search
'm': re.M, # Multi-line search
's': re.S, # Allows . to match newline characters
'u': re.U, # Treat a pattern as a sequence of unicode code points
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
}
_OBJ_NAME = '__youtube_dl_jsinterp_obj' _OBJ_NAME = '__youtube_dl_jsinterp_obj'
OP_CHARS = None OP_CHARS = None
@ -217,9 +204,48 @@ class JSInterpreter(object):
msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100])
super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
class JS_RegExp(object):
    """Wrapper exposing a compiled Python pattern for a JS regex literal.

    The pattern text is compiled with `re.compile` and the attributes of the
    compiled pattern object (as listed by `dir()`) are copied onto this
    instance, so the instance can be used where a compiled pattern's
    attributes (`pattern`, `flags`, `search`, `sub`, ...) are expected.
    """

    _RE_FLAGS = {
        # special knowledge: Python's re flags are bitmask values, current max 128
        # invent new bitmask values well above that for literal parsing
        # TODO: new pattern class to execute matches with these flags
        'd': 1024,  # Generate indices for substring matches
        'g': 2048,  # Global search
        'i': re.I,  # Case-insensitive search
        'm': re.M,  # Multi-line search
        's': re.S,  # Allows . to match newline characters
        'u': re.U,  # Treat a pattern as a sequence of unicode code points
        'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
    }

    def __init__(self, pattern_txt, flags=''):
        """Compile `pattern_txt` and mirror the compiled pattern's attributes.

        `flags` may be a string of JS flag letters (converted through
        `regex_flags`) or an already computed flag bitmask.
        """
        if isinstance(flags, compat_str):
            flags, _ = self.regex_flags(flags)
        # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern
        # First, avoid https://github.com/python/cpython/issues/74534
        self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags)
        for name in dir(self.__self):
            # Only these? Obviously __class__, __init__.
            # PyPy creates a __weakref__ attribute with value None
            # that can't be setattr'd but also can't need to be copied.
            if name in ('__class__', '__init__', '__weakref__'):
                continue
            setattr(self, name, getattr(self.__self, name))

    @classmethod
    def regex_flags(cls, expr):
        """Parse leading JS regex flag letters from `expr`.

        Returns a `(flags, remainder)` tuple: `flags` is the OR of the
        `_RE_FLAGS` values of the leading flag letters and `remainder` is the
        text starting at the first character that is not a flag letter.
        A falsy `expr` is returned unchanged with `flags == 0`.
        """
        flags = 0
        if not expr:
            return flags, expr
        consumed = 0
        for ch in expr:
            if ch not in cls._RE_FLAGS:
                break
            flags |= cls._RE_FLAGS[ch]
            consumed += 1
        # Slice at the number of flag letters consumed so that the first
        # non-flag character (eg, the `.` in `/x/i.exec(s)`) is kept.
        return flags, expr[consumed:]
@classmethod @classmethod
def __op_chars(cls): def __op_chars(cls):
op_chars = set(';,') op_chars = set(';,[')
for op in cls._all_operators(): for op in cls._all_operators():
for c in op[0]: for c in op[0]:
op_chars.add(c) op_chars.add(c)
@ -231,17 +257,6 @@ class JSInterpreter(object):
namespace[name] = obj namespace[name] = obj
return name return name
@classmethod
def _regex_flags(cls, expr):
    """Parse leading JS regex flag letters from `expr`.

    Returns a `(flags, remainder)` tuple: `flags` is the OR of the
    `cls._RE_FLAGS` values of the leading flag letters and `remainder` is
    the text starting at the first character that is not a flag letter.
    A falsy `expr` is returned unchanged with `flags == 0`.
    """
    flags = 0
    if not expr:
        return flags, expr
    consumed = 0
    for ch in expr:
        if ch not in cls._RE_FLAGS:
            break
        flags |= cls._RE_FLAGS[ch]
        consumed += 1
    # Slice at the count of consumed flag letters so the first non-flag
    # character is not dropped from the remainder.
    return flags, expr[consumed:]
@classmethod @classmethod
def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): def _separate(cls, expr, delim=',', max_split=None, skip_delims=None):
if not expr: if not expr:
@ -328,7 +343,7 @@ class JSInterpreter(object):
try: try:
return opfunc(left_val, right_val) return opfunc(left_val, right_val)
except Exception as e: except Exception as e:
raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e) raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e)
def _index(self, obj, idx, allow_undefined=False): def _index(self, obj, idx, allow_undefined=False):
if idx == 'length': if idx == 'length':
@ -338,7 +353,7 @@ class JSInterpreter(object):
except Exception as e: except Exception as e:
if allow_undefined: if allow_undefined:
return JS_Undefined return JS_Undefined
raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e)
def _dump(self, obj, namespace): def _dump(self, obj, namespace):
try: try:
@ -352,6 +367,7 @@ class JSInterpreter(object):
allow_recursion -= 1 allow_recursion -= 1
should_return = False should_return = False
# fails on (eg) if (...) stmt1; else stmt2;
sub_statements = list(self._separate(stmt, ';')) or [''] sub_statements = list(self._separate(stmt, ';')) or ['']
expr = stmt = sub_statements.pop().strip() expr = stmt = sub_statements.pop().strip()
for sub_stmt in sub_statements: for sub_stmt in sub_statements:
@ -371,25 +387,30 @@ class JSInterpreter(object):
if expr[0] in _QUOTES: if expr[0] in _QUOTES:
inner, outer = self._separate(expr, expr[0], 1) inner, outer = self._separate(expr, expr[0], 1)
if expr[0] == '/': if expr[0] == '/':
flags, outer = self._regex_flags(outer) flags, outer = self.JS_RegExp.regex_flags(outer)
inner = re.compile(inner[1:], flags=flags) # , strict=True)) inner = self.JS_RegExp(inner[1:], flags=flags)
else: else:
inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) inner = json.loads(js_to_json(inner + expr[0])) # , strict=True))
if not outer: if not outer:
return inner, should_return return inner, should_return
expr = self._named_object(local_vars, inner) + outer expr = self._named_object(local_vars, inner) + outer
if expr.startswith('new '): new_kw, _, obj = expr.partition('new ')
obj = expr[4:] if not new_kw:
if obj.startswith('Date('): for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)),
left, right = self._separate_at_paren(obj[4:]) ('RegExp', self.JS_RegExp),
expr = unified_timestamp( ('Error', self.Exception)):
self.interpret_expression(left, local_vars, allow_recursion), False) if not obj.startswith(klass + '('):
continue
left, right = self._separate_at_paren(obj[len(klass):])
argvals = self.interpret_iter(left, local_vars, allow_recursion)
expr = konstr(*argvals)
if not expr: if not expr:
raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr)
expr = self._dump(int(expr * 1000), local_vars) + right expr = self._dump(expr, local_vars) + right
break
else: else:
raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr) raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr)
if expr.startswith('void '): if expr.startswith('void '):
left = self.interpret_expression(expr[5:], local_vars, allow_recursion) left = self.interpret_expression(expr[5:], local_vars, allow_recursion)
@ -430,24 +451,45 @@ class JSInterpreter(object):
(?P<try>try)\s*\{| (?P<try>try)\s*\{|
(?P<if>if)\s*\(| (?P<if>if)\s*\(|
(?P<switch>switch)\s*\(| (?P<switch>switch)\s*\(|
(?P<for>for)\s*\( (?P<for>for)\s*\(|
(?P<while>while)\s*\(
''', expr) ''', expr)
md = m.groupdict() if m else {} md = m.groupdict() if m else {}
if md.get('if'): if md.get('if'):
cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
if_expr, expr = self._separate_at_paren(expr.lstrip()) if expr.startswith('{'):
# TODO: "else if" is not handled if_expr, expr = self._separate_at_paren(expr)
else:
# may lose ... else ... because of ll.368-374
if_expr, expr = self._separate_at_paren(expr, delim=';')
else_expr = None else_expr = None
m = re.match(r'else\s*{', expr) m = re.match(r'else\s*(?P<block>\{)?', expr)
if m: if m:
if m.group('block'):
else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) else_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
else:
# handle subset ... else if (...) {...} else ...
# TODO: make interpret_statement do this properly, if possible
exprs = list(self._separate(expr[m.end():], delim='}', max_split=2))
if len(exprs) > 1:
if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]):
else_expr = exprs[0] + '}' + exprs[1]
expr = (exprs[2] + '}') if len(exprs) == 3 else None
else:
else_expr = exprs[0]
exprs.append('')
expr = '}'.join(exprs[1:])
else:
else_expr = exprs[0]
expr = None
else_expr = else_expr.lstrip() + '}'
cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion))
ret, should_abort = self.interpret_statement( ret, should_abort = self.interpret_statement(
if_expr if cndn else else_expr, local_vars, allow_recursion) if_expr if cndn else else_expr, local_vars, allow_recursion)
if should_abort: if should_abort:
return ret, True return ret, True
if md.get('try'): elif md.get('try'):
try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) try_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
err = None err = None
try: try:
@ -484,8 +526,8 @@ class JSInterpreter(object):
if err: if err:
raise err raise err
elif md.get('for'): elif md.get('for') or md.get('while'):
constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:])
if remaining.startswith('{'): if remaining.startswith('{'):
body, expr = self._separate_at_paren(remaining) body, expr = self._separate_at_paren(remaining)
else: else:
@ -496,11 +538,12 @@ class JSInterpreter(object):
body = 'switch(%s){%s}' % (switch_val, body) body = 'switch(%s){%s}' % (switch_val, body)
else: else:
body, expr = remaining, '' body, expr = remaining, ''
start, cndn, increment = self._separate(constructor, ';') if md.get('for'):
start, cndn, increment = self._separate(init_or_cond, ';')
self.interpret_expression(start, local_vars, allow_recursion) self.interpret_expression(start, local_vars, allow_recursion)
while True: else:
if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): cndn, increment = init_or_cond, None
break while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)):
try: try:
ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion)
if should_abort: if should_abort:
@ -509,6 +552,7 @@ class JSInterpreter(object):
break break
except JS_Continue: except JS_Continue:
pass pass
if increment:
self.interpret_expression(increment, local_vars, allow_recursion) self.interpret_expression(increment, local_vars, allow_recursion)
elif md.get('switch'): elif md.get('switch'):
@ -764,6 +808,10 @@ class JSInterpreter(object):
if idx >= len(obj): if idx >= len(obj):
return None return None
return ord(obj[idx]) return ord(obj[idx])
elif member == 'replace':
assertion(isinstance(obj, compat_str), 'must be applied on a string')
assertion(len(argvals) == 2, 'takes exactly two arguments')
return re.sub(argvals[0], argvals[1], obj)
idx = int(member) if isinstance(obj, list) else member idx = int(member) if isinstance(obj, list) else member
return obj[idx](argvals, allow_recursion=allow_recursion) return obj[idx](argvals, allow_recursion=allow_recursion)
@ -795,6 +843,10 @@ class JSInterpreter(object):
raise self.Exception('Cannot return from an expression', expr) raise self.Exception('Cannot return from an expression', expr)
return ret return ret
def interpret_iter(self, list_txt, local_vars, allow_recursion):
    """Lazily evaluate each separated item of `list_txt`.

    The text is split with `self._separate` (using its default delimiter)
    and each piece is passed to `self.interpret_expression` along with
    `local_vars` and `allow_recursion`; the results are yielded in order.
    """
    for item_txt in self._separate(list_txt):
        value = self.interpret_expression(item_txt, local_vars, allow_recursion)
        yield value
def extract_object(self, objname): def extract_object(self, objname):
_FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
obj = {} obj = {}