diff --git a/src/subprocess_util/data_units.py b/src/subprocess_util/data_units.py
index 2e50bea..c495202 100644
--- a/src/subprocess_util/data_units.py
+++ b/src/subprocess_util/data_units.py
@@ -41,6 +41,11 @@ class DataUnitConverter:
     def to_unit(cls, num_bytes: int, unit: DataUnit = DataUnit.GB) -> float:
         return round(num_bytes / cls._unit_dict[unit], 3)
 
+    @classmethod
+    def to_unit_str(cls, num_bytes: int, unit: DataUnit = DataUnit.GB) -> str:
+        """Convert ``num_bytes`` to ``unit`` via ``to_unit`` and format the result with ``to_str``."""
+        return cls.to_str(cls.to_unit(num_bytes, unit), unit)
+
     @classmethod
     def to_unit_auto(cls, num_bytes: int) -> tuple[float, DataUnit]:
         for unit, factor in cls._unit_dict.items():
@@ -53,6 +58,10 @@ class DataUnitConverter:
     @classmethod
     def to_unit_auto_str(cls, num_bytes: int) -> str:
         value, unit = cls.to_unit_auto(num_bytes)
+        return cls.to_str(value, unit)
+
+    @classmethod
+    def to_str(cls, value: float, unit: DataUnit) -> str:  # renders '<value> <unit.value>'
         return f'{value} {unit.value}'
 
 
diff --git a/src/subprocess_util/exec_consume_chunks.py b/src/subprocess_util/exec_consume_chunks.py
index c22eded..f5b6511 100644
--- a/src/subprocess_util/exec_consume_chunks.py
+++ b/src/subprocess_util/exec_consume_chunks.py
@@ -72,7 +72,8 @@ def _stdin_worker(queue_get: Queue.get,
                   binary_stdin: IO[AnyStr],
                   ):
     start_time = time.time()
-    transferred_bytes = 0
+    total_transferred_bytes = 0
+    last_time = start_time
 
     while True:
         chunk_path: Path
@@ -82,14 +83,23 @@ def _stdin_worker(queue_get: Queue.get,
         binary_stdin.write(chunk)
         # binary_stdin.flush()  # TODO: is this required?
 
-        # TODO: print total transfer speed and last chunk transfer speed
         current_time = time.time()
-        elapsed_time = current_time - start_time
-        transferred_bytes += len(chunk)
+        # Floor at 1e-6 s: time.time() can repeat or step back, and a span <= 0 breaks the divisions below.
+        elapsed_time = max(current_time - last_time, 1e-6)
+        total_elapsed_time = max(current_time - start_time, 1e-6)
+        transferred_bytes = len(chunk)
+        total_transferred_bytes += transferred_bytes
         bytes_per_second = round(transferred_bytes / elapsed_time)
-        print(f'Elapsed time: {datetime.timedelta(seconds=elapsed_time)}\n'
-              f'Transferred: {DataUnitConverter.to_unit_auto_str(transferred_bytes)}\n'
-              f'Speed: {DataUnitConverter.to_unit_auto_str(bytes_per_second)}/s')
+        total_bytes_per_second = round(total_transferred_bytes / total_elapsed_time)
+        #
+        total_speed, total_speed_unit = DataUnitConverter.to_unit_auto(total_bytes_per_second)
+        print(f'Elapsed time:  {datetime.timedelta(seconds=total_elapsed_time)}\n'
+              f'Transferred:   {DataUnitConverter.to_unit_auto_str(total_transferred_bytes)}\n'
+              f'Total speed:   {DataUnitConverter.to_str(total_speed, total_speed_unit)}/s\n'
+              f'Current speed: {DataUnitConverter.to_unit_str(bytes_per_second, total_speed_unit)}/s'
+              )
+        #
+        last_time = current_time
 
         if last_chunk:
             break