Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions datadog_lambda/cold_start.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,28 @@
logger = logging.getLogger(__name__)

_cold_start = True
_proactive_initialization = False
_lambda_container_initialized = False


def set_cold_start():
def set_cold_start(init_timestamp_ns):
"""Set the value of the cold start global

This should be executed once per Lambda execution before the execution
"""
global _cold_start
global _lambda_container_initialized
_cold_start = not _lambda_container_initialized
global _proactive_initialization
if not _lambda_container_initialized:
now = time.time_ns()
if (now - init_timestamp_ns) // 1_000_000_000 > 10:
_cold_start = False
Copy link

@pvicente pvicente Jul 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @astuyve @joeyzhao2018 should it also be _cold_start = True? I'm getting use cold_start stats from apms and with this change I will need to take into account new proactive_initialization= true to measure it like before.

Based on AWS docs I'm not sure if in this situation cold_start should be true or false. Any thoughts?

Thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @pvicente - We're actively working with AWS on documentation for this and I can't say more right now, but you should open a support ticket with them and tell them AJ Stuyvenberg sent you ;)

A bit more about what I can say:
With this change you'll still get cold_start:true if the AWS Lambda function actually experienced a cold start, like so:
image

However if the function is proactively initialized, then it's not a cold start in the sense that there was no additional latency caused by sandbox initialization (as init occurred seconds, hours, or minutes before the first request was served by the sandbox).

We now tag those correctly, so when we use APM to select this python invocation:
image
We can see that initialization occurred 3m59s before the function was executed:
image

I hope I've helped!

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation @astuyve 🥇

_proactive_initialization = True
else:
_cold_start = not _lambda_container_initialized
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@astuyve I think on this case _proactive_initialization = False see my case from APM stats:

Screenshot 2023-07-07 at 14 28 28

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We initialize _proactive_initialization to False on line 9 so we shouldn't need to redeclare that here.

In this case rolling up to top 10 is skewing the results:
image
I can see something similar in one of my orgs:
image
But when I remove the pie chart grouping, I see many cold_start: true spans:
image

However for clarity, line 27 should always be false as we're already inside of a conditional checking for _lambda_container_initialized == false

else:
_cold_start = False
_proactive_initialization = False
_lambda_container_initialized = True


Expand All @@ -25,11 +36,25 @@ def is_cold_start():
return _cold_start


def is_proactive_init():
"""Returns the value of the global proactive_initialization"""
return _proactive_initialization


def is_new_sandbox():
return is_cold_start() or is_proactive_init()


def get_cold_start_tag():
"""Returns the cold start tag to be used in metrics"""
return "cold_start:{}".format(str(is_cold_start()).lower())


def get_proactive_init_tag():
"""Returns the proactive init tag to be used in metrics"""
return "proactive_initialization:{}".format(str(is_proactive_init()).lower())


class ImportNode(object):
def __init__(self, module_name, full_file_path, start_time_ns, end_time_ns=None):
self.module_name = module_name
Expand Down Expand Up @@ -115,7 +140,7 @@ def wrapped_find_spec(*args, **kwargs):

def initialize_cold_start_tracing():
if (
is_cold_start()
is_new_sandbox()
and os.environ.get("DD_TRACE_ENABLED", "true").lower() == "true"
and os.environ.get("DD_COLD_START_TRACING", "true").lower() == "true"
):
Expand Down
3 changes: 3 additions & 0 deletions datadog_lambda/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,7 @@ def create_function_execution_span(
context,
function_name,
is_cold_start,
is_proactive_init,
trace_context_source,
merge_xray_traces,
trigger_tags,
Expand All @@ -1235,6 +1236,8 @@ def create_function_execution_span(
"dd_trace": ddtrace_version,
"span.name": "aws.lambda",
}
if is_proactive_init:
tags["proactive_initialization"] = str(is_proactive_init).lower()
if trace_context_source == TraceContextSource.XRAY and merge_xray_traces:
tags["_dd.parent_source"] = trace_context_source
tags.update(trigger_tags)
Expand Down
17 changes: 13 additions & 4 deletions datadog_lambda/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
from time import time_ns

from datadog_lambda.extension import should_use_extension, flush_extension
from datadog_lambda.cold_start import set_cold_start, is_cold_start, ColdStartTracer
from datadog_lambda.cold_start import (
set_cold_start,
is_cold_start,
is_proactive_init,
is_new_sandbox,
ColdStartTracer,
)
from datadog_lambda.constants import (
TraceContextSource,
XraySubsegment,
Expand Down Expand Up @@ -72,6 +78,8 @@

env_env_var = os.environ.get(DD_ENV, None)

init_timestamp_ns = time_ns()

"""
Usage:

Expand Down Expand Up @@ -245,7 +253,7 @@ def _inject_authorizer_span_headers(self, request_id):
def _before(self, event, context):
try:
self.response = None
set_cold_start()
set_cold_start(init_timestamp_ns)
submit_invocations_metric(context)
self.trigger_tags = extract_trigger_tags(event, context)
# Extract Datadog trace context and source from incoming requests
Expand All @@ -272,14 +280,15 @@ def _before(self, event, context):
context,
self.function_name,
is_cold_start(),
is_proactive_init(),
trace_context_source,
self.merge_xray_traces,
self.trigger_tags,
parent_span=self.inferred_span,
)
else:
set_correlation_ids()
if profiling_env_var and is_cold_start():
if profiling_env_var and is_new_sandbox():
self.prof.start(stop_on_exit=False, profile_children=True)
logger.debug("datadog_lambda_wrapper _before() done")
except Exception:
Expand All @@ -299,7 +308,7 @@ def _after(self, event, context):
self.trigger_tags, XraySubsegment.LAMBDA_FUNCTION_TAGS_KEY
)
should_trace_cold_start = (
dd_tracing_enabled and self.cold_start_tracing and is_cold_start()
dd_tracing_enabled and self.cold_start_tracing and is_new_sandbox()
)
if should_trace_cold_start:
trace_ctx = tracer.current_trace_context()
Expand Down
29 changes: 29 additions & 0 deletions tests/test_cold_start.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
import unittest
import datadog_lambda.cold_start as cold_start
from sys import modules, meta_path
Expand All @@ -6,6 +7,34 @@


class TestColdStartTracingSetup(unittest.TestCase):
def test_proactive_init(self):
cold_start._cold_start = True
cold_start._proactive_initialization = False
cold_start._lambda_container_initialized = False
fifteen_seconds_ago = time.time_ns() - 15_000_000_000
cold_start.set_cold_start(fifteen_seconds_ago)
self.assertTrue(cold_start.is_proactive_init())
self.assertTrue(cold_start.is_new_sandbox())
self.assertFalse(cold_start.is_cold_start())
self.assertEqual(
cold_start.get_proactive_init_tag(), "proactive_initialization:true"
)
self.assertEqual(cold_start.get_cold_start_tag(), "cold_start:false")

def test_cold_start(self):
cold_start._cold_start = True
cold_start._proactive_initialization = False
cold_start._lambda_container_initialized = False
one_second_ago = time.time_ns() - 1_000_000_000
cold_start.set_cold_start(one_second_ago)
self.assertFalse(cold_start.is_proactive_init())
self.assertTrue(cold_start.is_new_sandbox())
self.assertTrue(cold_start.is_cold_start())
self.assertEqual(
cold_start.get_proactive_init_tag(), "proactive_initialization:false"
)
self.assertEqual(cold_start.get_cold_start_tag(), "cold_start:true")

def test_initialize_cold_start_tracing(self):
cold_start._cold_start = True
cold_start.initialize_cold_start_tracing() # testing double wrapping
Expand Down
13 changes: 10 additions & 3 deletions tests/test_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,9 @@ def test_set_correlation_ids_handle_empty_trace_context(self):
class TestFunctionSpanTags(unittest.TestCase):
def test_function(self):
ctx = get_mock_context()
span = create_function_execution_span(ctx, "", False, {"source": ""}, False, {})
span = create_function_execution_span(
ctx, "", False, False, {"source": ""}, False, {}
)
self.assertEqual(span.get_tag("function_arn"), function_arn)
self.assertEqual(span.get_tag("function_version"), "$LATEST")
self.assertEqual(span.get_tag("resource_names"), "Function")
Expand All @@ -537,7 +539,9 @@ def test_function_with_version(self):
ctx = get_mock_context(
invoked_function_arn=function_arn + ":" + function_version
)
span = create_function_execution_span(ctx, "", False, {"source": ""}, False, {})
span = create_function_execution_span(
ctx, "", False, False, {"source": ""}, False, {}
)
self.assertEqual(span.get_tag("function_arn"), function_arn)
self.assertEqual(span.get_tag("function_version"), function_version)
self.assertEqual(span.get_tag("resource_names"), "Function")
Expand All @@ -546,7 +550,9 @@ def test_function_with_version(self):
def test_function_with_alias(self):
function_alias = "alias"
ctx = get_mock_context(invoked_function_arn=function_arn + ":" + function_alias)
span = create_function_execution_span(ctx, "", False, {"source": ""}, False, {})
span = create_function_execution_span(
ctx, "", False, False, {"source": ""}, False, {}
)
self.assertEqual(span.get_tag("function_arn"), function_arn)
self.assertEqual(span.get_tag("function_version"), function_alias)
self.assertEqual(span.get_tag("resource_names"), "Function")
Expand All @@ -558,6 +564,7 @@ def test_function_with_trigger_tags(self):
ctx,
"",
False,
False,
{"source": ""},
False,
{"function_trigger.event_source": "cloudwatch-logs"},
Expand Down