fix for invalid utf-8 strings in protobuf, add some more type annotations
Stefan Schuermans

Stefan Schuermans committed on 2026-03-04 20:34:35
Showing 2 changed files, with 72 additions and 54 deletions.

... ...
@@ -18,7 +18,7 @@ def read_event(proto_file):
18 18
     """
19 19
     # skip till after magic
20 20
     magic = proto_file.read(4)
21
-    while magic != b'upt0':
21
+    while magic != b"upt0":
22 22
         if len(magic) < 4:
23 23
             return None  # EOF
24 24
         magic = magic[1:] + proto_file.read(1)  # search magic byte for byte
... ...
@@ -26,7 +26,7 @@ def read_event(proto_file):
26 26
     size = proto_file.read(4)
27 27
     if len(size) < 4:
28 28
         return None  # EOF
29
-    size = struct.unpack('!L', size)[0]
29
+    size = struct.unpack("!L", size)[0]
30 30
     # read event data
31 31
     data = proto_file.read(size)
32 32
     if len(data) < size:
... ...
@@ -36,10 +36,13 @@ def read_event(proto_file):
36 36
     return pb2_ev
37 37
 
38 38
 
39
-class BaseEvent():
39
+class BaseEvent:
40 40
     """
41 41
     Base class for all events.
42 42
     """
43
+
44
+    # pylint: disable=too-few-public-methods
45
+
43 46
     def __init__(self, pb2_ev: pb2.event):
44 47
         """
45 48
         Initialize base event from PB2 event.
... ...
@@ -48,18 +51,25 @@ class BaseEvent():
48 51
         self._pb2_ev = pb2_ev
49 52
         self._timestamp = self._pb2GetTimespec(pb2_ev.timestamp)
50 53
 
51
-    def _pb2GetStringList(self, s_l: pb2.stringlist) -> list:
54
+    def _pb2GetString(self, s: str | bytes) -> str:
55
+        if isinstance(s, str):
56
+            return s
57
+        if isinstance(s, bytes):
58
+            return s.decode("utf-8", errors="replace")
59
+        return str(s)
60
+
61
+    def _pb2GetStringList(self, s_l: pb2.stringlist) -> list[str]:
52 62
         """
53 63
         Get PB2 string list as Python list.
54 64
         """
55
-        return [s for s in s_l.s]
65
+        return [self._pb2GetString(s) for s in s_l.s]
56 66
 
57 67
     def _pb2GetTimespec(self, t_s: pb2.timespec) -> float:
58 68
         """
59 69
         Get PB2 timespec value in seconds.
60 70
         """
61 71
         sec = t_s.sec
62
-        if t_s.HasField('nsec'):
72
+        if t_s.HasField("nsec"):
63 73
             sec += t_s.nsec * 1e-9
64 74
         return sec
65 75
 
... ...
@@ -123,13 +135,15 @@ class ProcBegin(ProcBeginOrEnd):
123 135
         super().__init__(pb2_ev)
124 136
         p_b = pb2_ev.proc_begin
125 137
         self._pid = p_b.pid
126
-        self._ppid = p_b.ppid if p_b.HasField('ppid') else None
127
-        self._exe = p_b.exe if p_b.HasField('exe') else None
128
-        self._cwd = p_b.cwd if p_b.HasField('cwd') else None
129
-        self._cmdline = self._pb2GetStringList(
130
-            p_b.cmdline) if p_b.HasField('cmdline') else None
131
-        self._environ = self._pb2GetStringList(
132
-            p_b.environ) if p_b.HasField('environ') else None
138
+        self._ppid = p_b.ppid if p_b.HasField("ppid") else None
139
+        self._exe = self._pb2GetString(p_b.exe) if p_b.HasField("exe") else None
140
+        self._cwd = self._pb2GetString(p_b.cwd) if p_b.HasField("cwd") else None
141
+        self._cmdline = (
142
+            self._pb2GetStringList(p_b.cmdline) if p_b.HasField("cmdline") else None
143
+        )
144
+        self._environ = (
145
+            self._pb2GetStringList(p_b.environ) if p_b.HasField("environ") else None
146
+        )
133 147
 
134 148
     @property
135 149
     def exe(self) -> str:
... ...
@@ -146,14 +160,14 @@ class ProcBegin(ProcBeginOrEnd):
146 160
         return self._cwd
147 161
 
148 162
     @property
149
-    def cmdline(self) -> list:
163
+    def cmdline(self) -> list[str]:
150 164
         """
151 165
         Command line arguments of process (list of strings).
152 166
         """
153 167
         return self._cmdline.copy()
154 168
 
155 169
     @property
156
-    def environ(self) -> list:
170
+    def environ(self) -> list[str]:
157 171
         """
158 172
         Environment variables of process (list of strings).
159 173
         """
... ...
@@ -174,21 +188,23 @@ class ProcEnd(ProcBeginOrEnd):
174 188
         super().__init__(pb2_ev)
175 189
         p_e = pb2_ev.proc_end
176 190
         self._pid = p_e.pid
177
-        self._ppid = p_e.ppid if p_e.HasField('ppid') else None
178
-        self._cpu_time = self._pb2GetTimespec(
179
-            p_e.cpu_time) if p_e.HasField('cpu_time') else None
180
-        self._user_time = self._pb2GetTimespec(
181
-            p_e.user_time) if p_e.HasField('user_time') else None
182
-        self._sys_time = self._pb2GetTimespec(
183
-            p_e.sys_time) if p_e.HasField('sys_time') else None
184
-        self._max_rss_kb = p_e.max_rss_kb if p_e.HasField(
185
-            'max_rss_kb') else None
186
-        self._min_flt = p_e.min_flt if p_e.HasField('min_flt') else None
187
-        self._maj_flt = p_e.maj_flt if p_e.HasField('maj_flt') else None
188
-        self._in_block = p_e.in_block if p_e.HasField('in_block') else None
189
-        self._ou_block = p_e.ou_block if p_e.HasField('ou_block') else None
190
-        self._n_v_csw = p_e.n_v_csw if p_e.HasField('n_v_csw') else None
191
-        self._n_iv_csw = p_e.n_iv_csw if p_e.HasField('n_iv_csw') else None
191
+        self._ppid = p_e.ppid if p_e.HasField("ppid") else None
192
+        self._cpu_time = (
193
+            self._pb2GetTimespec(p_e.cpu_time) if p_e.HasField("cpu_time") else None
194
+        )
195
+        self._user_time = (
196
+            self._pb2GetTimespec(p_e.user_time) if p_e.HasField("user_time") else None
197
+        )
198
+        self._sys_time = (
199
+            self._pb2GetTimespec(p_e.sys_time) if p_e.HasField("sys_time") else None
200
+        )
201
+        self._max_rss_kb = p_e.max_rss_kb if p_e.HasField("max_rss_kb") else None
202
+        self._min_flt = p_e.min_flt if p_e.HasField("min_flt") else None
203
+        self._maj_flt = p_e.maj_flt if p_e.HasField("maj_flt") else None
204
+        self._in_block = p_e.in_block if p_e.HasField("in_block") else None
205
+        self._ou_block = p_e.ou_block if p_e.HasField("ou_block") else None
206
+        self._n_v_csw = p_e.n_v_csw if p_e.HasField("n_v_csw") else None
207
+        self._n_iv_csw = p_e.n_iv_csw if p_e.HasField("n_iv_csw") else None
192 208
 
193 209
     @property
194 210
     def cpu_time(self) -> float:
... ...
@@ -286,8 +303,8 @@ def parse_event(proto_file, visitor: Visitor) -> bool:
286 303
     pb2_ev = read_event(proto_file)
287 304
     if pb2_ev is None:
288 305
         return False
289
-    if pb2_ev.HasField('proc_begin'):
306
+    if pb2_ev.HasField("proc_begin"):
290 307
         visitor.visitProcBegin(ProcBegin(pb2_ev))
291
-    if pb2_ev.HasField('proc_end'):
308
+    if pb2_ev.HasField("proc_end"):
292 309
         visitor.visitProcEnd(ProcEnd(pb2_ev))
293 310
     return True
... ...
@@ -9,13 +9,13 @@ import collections
9 9
 import uproctrace.parse
10 10
 
11 11
 
12
-class Process():
12
+class Process:
13 13
     """
14 14
     A process parsed from a trace.
15 15
     """
16 16
 
17 17
     # pylint: disable=R0904
18
-    def __init__(self, proc_id: int, pid: int):
18
+    def __init__(self, proc_id: int, pid: int) -> None:
19 19
         """
20 20
         Initialize process.
21 21
         """
... ...
@@ -27,7 +27,7 @@ class Process():
27 27
         self._children = collections.OrderedDict()  # proc_id -> Process
28 28
 
29 29
     @property
30
-    def begin_timestamp(self) -> list:
30
+    def begin_timestamp(self) -> float:
31 31
         """
32 32
         Begin timestamp of process.
33 33
         """
... ...
@@ -36,14 +36,14 @@ class Process():
36 36
         return self._begin.timestamp
37 37
 
38 38
     @property
39
-    def children(self) -> list:
39
+    def children(self) -> list["Process"]:
40 40
         """
41 41
         List of child processes.
42 42
         """
43 43
         return list(self._children.values())
44 44
 
45 45
     @property
46
-    def cmdline(self) -> list:
46
+    def cmdline(self) -> list[str]:
47 47
         """
48 48
         Command line of process.
49 49
         """
... ...
@@ -70,7 +70,7 @@ class Process():
70 70
         return self._begin.cwd
71 71
 
72 72
     @property
73
-    def end_timestamp(self) -> list:
73
+    def end_timestamp(self) -> float:
74 74
         """
75 75
         End timestamp of process.
76 76
         """
... ...
@@ -79,7 +79,7 @@ class Process():
79 79
         return self._end.timestamp
80 80
 
81 81
     @property
82
-    def environ(self) -> list:
82
+    def environ(self) -> list[str]:
83 83
         """
84 84
         Environment of process.
85 85
         """
... ...
@@ -160,21 +160,21 @@ class Process():
160 160
         return self._end.ou_block
161 161
 
162 162
     @property
163
-    def parent(self):
163
+    def parent(self) -> "Process | None":
164 164
         """
165 165
         Parent process (or None).
166 166
         """
167 167
         return self._parent
168 168
 
169 169
     @property
170
-    def pid(self):
170
+    def pid(self) -> int:
171 171
         """
172 172
         Linux process ID.
173 173
         """
174 174
         return self._pid
175 175
 
176 176
     @property
177
-    def ppid(self):
177
+    def ppid(self) -> int | None:
178 178
         """
179 179
         Linux process ID of parent process.
180 180
         """
... ...
@@ -187,7 +187,7 @@ class Process():
187 187
         return None
188 188
 
189 189
     @property
190
-    def proc_id(self):
190
+    def proc_id(self) -> int:
191 191
         """
192 192
         Process ID. (This is not the PID.)
193 193
         """
... ...
@@ -211,32 +211,32 @@ class Process():
211 211
             return None
212 212
         return self._end.user_time
213 213
 
214
-    def addChild(self, child):
214
+    def addChild(self, child) -> None:
215 215
         """
216 216
         Add a child process.
217 217
         """
218 218
         self._children[child.proc_id] = child
219 219
 
220
-    def removeChild(self, child_proc_id: int):
220
+    def removeChild(self, child_proc_id: int) -> None:
221 221
         """
222 222
         Remove a child process.
223 223
         """
224 224
         if child_proc_id in self._children:
225 225
             del self._children[child_proc_id]
226 226
 
227
-    def setBegin(self, proc_begin: uproctrace.parse.ProcBegin):
227
+    def setBegin(self, proc_begin: uproctrace.parse.ProcBegin) -> None:
228 228
         """
229 229
         Set begin event of process.
230 230
         """
231 231
         self._begin = proc_begin
232 232
 
233
-    def setEnd(self, proc_end: uproctrace.parse.ProcEnd):
233
+    def setEnd(self, proc_end: uproctrace.parse.ProcEnd) -> None:
234 234
         """
235 235
         Set end event of process.
236 236
         """
237 237
         self._end = proc_end
238 238
 
239
-    def setParent(self, parent):
239
+    def setParent(self, parent: "Process") -> None:
240 240
         """
241 241
         Set parent process.
242 242
         """
... ...
@@ -247,16 +247,20 @@ class Processes(uproctrace.parse.Visitor):
247 247
     """
248 248
     Collection of all processes from a trace.
249 249
     """
250
-    def __init__(self, proto_file):
250
+
251
+    def __init__(self, proto_file) -> None:
251 252
         """
252 253
         Initialize processes from a trace file (f).
253 254
         """
254 255
         super().__init__()
255
-        self._timeline = dict()  # time -> list(parse.BaseEvent)
256
-        self._all_processes = dict()  # proc_id -> process
257
-        self._current_processes = dict()  # pid -> process (while pid alive)
256
+        # time -> list(parse.BaseEvent)
257
+        self._timeline: dict[float, list[uproctrace.parse.BaseEvent]] = {}
258
+        # proc_id -> process
259
+        self._all_processes: dict[int, Process] = {}
260
+        # pid -> process (while pid alive)
261
+        self._current_processes: dict[int, Process] = {}
258 262
         # ordered dictionary of toplevel processes: proc_id -> Process
259
-        self._toplevel_processes = collections.OrderedDict()
263
+        self._toplevel_processes: dict[int, Process] = collections.OrderedDict()
260 264
         # parse trace
261 265
         self._readTrace(proto_file)
262 266
 
... ...
@@ -309,7 +313,7 @@ class Processes(uproctrace.parse.Visitor):
309 313
         Common processing for all events.
310 314
         """
311 315
         # store event in timeline
312
-        self._timeline.setdefault(event.timestamp, list()).append(event)
316
+        self._timeline.setdefault(event.timestamp, []).append(event)
313 317
 
314 318
     @property
315 319
     def toplevel(self) -> list:
316 320