<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Biomed Eng</journal-id><journal-id journal-id-type="publisher-id">biomedeng</journal-id><journal-id journal-id-type="index">24</journal-id><journal-title>JMIR Biomedical Engineering</journal-title><abbrev-journal-title>JMIR Biomed Eng</abbrev-journal-title><issn pub-type="epub">2561-3278</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e88053</article-id><article-id pub-id-type="doi">10.2196/88053</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Increasing Large Language Model Accuracy for Care-Seeking Advice Using Prompts Reflecting Human Reasoning Strategies in the Real World: Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kopka</surname><given-names>Marvin</given-names></name><degrees>BSc, MSc, MPH, Dr rer medic, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Feufel</surname><given-names>Markus A</given-names></name><degrees>Dipl-Ing (FH), MSc, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Division of Ergonomics, Department of Psychology &#x0026; Ergonomics (IPA), Technische Universit&#x00E4;t Berlin</institution><addr-line>Stra&#x00DF;e des 17. 
Juni 135</addr-line><addr-line>Berlin</addr-line><addr-line>State of Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Baxter</surname><given-names>Clarence</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Marvin Kopka, BSc, MSc, MPH, Dr rer medic, PhD, Division of Ergonomics, Department of Psychology &#x0026; Ergonomics (IPA), Technische Universit&#x00E4;t Berlin, Stra&#x00DF;e des 17. Juni 135, Berlin, State of Berlin, 10623, Germany, 49 31470806; <email>marvin.kopka@tu-berlin.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>8</day><month>4</month><year>2026</year></pub-date><volume>11</volume><elocation-id>e88053</elocation-id><history><date date-type="received"><day>18</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>04</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>04</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Marvin Kopka, Markus A Feufel. Originally published in JMIR Biomedical Engineering (<ext-link ext-link-type="uri" xlink:href="http://biomedeng.jmir.org">http://biomedeng.jmir.org</ext-link>), 8.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Biomedical Engineering, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://biomedeng.jmir.org/">https://biomedeng.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://biomedeng.jmir.org/2026/1/e88053"/><abstract><sec><title>Background</title><p>Current prompting techniques for large language models (LLMs), such as ChatGPT, mainly focus on well-structured, low-uncertainty problems; yet, many real-world tasks (eg, care-seeking decisions) are ill-defined and involve high uncertainty. 
Naturalistic decision-making (NDM) specifically analyzes how humans make accurate decisions in such settings, but NDM concepts have not yet been applied to LLM prompt engineering.</p></sec><sec><title>Objective</title><p>This study aimed to determine whether prompting strategies inspired by NDM (specifically based on recognition-primed decision-making and the data-frame theory) could improve LLM performance in a real-world, high-uncertainty task, such as making care-seeking decisions.</p></sec><sec sec-type="methods"><title>Methods</title><p>We evaluated 10 ChatGPT models (GPT-4o, GPT-4.1, GPT-4.1 mini, o3, o4 mini, o4 mini high, GPT-5.1 Instant, GPT-5.1 Thinking, GPT-5.2 Instant, and GPT-5.2 Thinking) using 3 prompting strategies: a default prompt solely asking the LLMs to classify the case vignettes, a recognition-primed prompt tasking the models to reason according to recognition-primed decision-making, and a data-frame prompt tasking the models to apply the data-frame theory. The task was taken from a standardized and validated evaluation framework and instructed the LLMs to advise on the appropriate care-seeking action for 45 real patient case vignettes across 3 urgency levels (emergency, nonemergency, and self-care). Each model-vignette-prompt combination was tested 10 times to assess and account for output variability. Accuracy was analyzed using mixed effects logistic regression. Additionally, we evaluated accuracy for each urgency level and examined output variability.</p></sec><sec sec-type="results"><title>Results</title><p>Both NDM-inspired prompts increased overall model accuracy (recognition-primed: 67.6%; data-frame: 66.7%) compared to the default prompt (63.3%). The greatest improvements were observed for self-care recommendations, where accuracy increased from 13.4% (default prompt) to 29.8% (recognition-primed prompt) and 24.6% (data-frame prompt). Performance on 2 emergency and 30 nonemergency cases remained high across all prompts. 
Notably, NDM-inspired prompts made nonreasoning models start giving self-care advice, even though they rarely or never provided self-care advice with the default prompt. Output variability was similar across the 3 prompts.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Using LLMs with prompts inspired by NDM, which are designed to reflect real-world human reasoning, improves the accuracy of LLMs in care-seeking tasks, particularly for self-care advice, without reducing performance in the included emergency or nonemergency cases. These findings indicate that NDM-inspired prompts can offer an advantage when LLMs are used for real-world decisions involving ambiguity and uncertainty. The impact of output that reflects real-world human reasoning on users&#x2019; decision-making must be evaluated in future studies.</p></sec></abstract><kwd-group><kwd>prompting</kwd><kwd>human-technology interaction</kwd><kwd>human factors</kwd><kwd>artificial intelligence</kwd><kwd>decision-making</kwd><kwd>naturalistic decision-making</kwd><kwd>naturalistic decision support</kwd><kwd>cognitive science</kwd><kwd>care-seeking</kwd><kwd>self-triage</kwd><kwd>bounded rationality</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Since their public release in 2022, large language models (LLMs), such as ChatGPT, have become widely used across domains for a range of tasks [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Although these models now reach high levels of accuracy on several benchmark tests, both researchers and users are increasingly interested in techniques to further improve model performance through specific input instructions&#x2014;a process known as &#x201C;prompting&#x201D; [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Common approaches include assigning the model a specific role, providing relevant context or examples, or specifying a clear output format [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Three basic prompting strategies are often described in the literature: zero-shot, one-shot, and few-shot prompting. Zero-shot prompting refers to providing only the task instructions without any example outputs. One-shot prompting includes a single example of the expected output, and few-shot prompting provides multiple examples of the expected output [<xref ref-type="bibr" rid="ref12">12</xref>]. In recent work, prompting strategies focus more on guiding the model through a reasoning process rather than simply providing information. A recent systematic review identified 58 prompting techniques, which were grouped into 6 categories [<xref ref-type="bibr" rid="ref13">13</xref>]. In addition to zero-shot and few-shot approaches, 4 new categories were described: <italic>ensembling prompts</italic> use multiple prompts and aggregate the resulting outputs [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. <italic>Self-criticism prompts</italic> instruct the model to evaluate and critique its own answers before responding [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. <italic>Decomposition prompts</italic> instruct LLMs to break down tasks into smaller steps, which are then solved sequentially [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. 
Finally, <italic>thought generation</italic> or &#x201C;<italic>chain-of-thought&#x201D;</italic> prompts ask the model to explicitly explain its reasoning as it works through a problem [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Notably, chain-of-thought and reasoning prompts have now been directly integrated into newer models [<xref ref-type="bibr" rid="ref23">23</xref>]. For example, OpenAI&#x2019;s o-series models (including o1, o3, and o4) are designed to generate a reasoning response before generating a user response [<xref ref-type="bibr" rid="ref23">23</xref>]. This approach has been shown to improve accuracy across several benchmarks [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Starting with GPT-5, OpenAI also introduced a new, automatically included reasoning engine that consists of several internal expert models to which a user&#x2019;s request is routed [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The model also automatically determines the reasoning effort needed to answer the user&#x2019;s request [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>Although the chain-of-thought prompting strategy is inspired by human reasoning, particularly deductive decision-making, there is an ongoing debate about whether LLMs really replicate human reasoning or simply generate plausible-sounding explanations [<xref ref-type="bibr" rid="ref28">28</xref>]. Shojaee et al [<xref ref-type="bibr" rid="ref29">29</xref>] recently tested chain-of-thought reasoning models on increasingly complex puzzles and found that LLMs do not engage in consistent reasoning across similar problems. 
In response, Lawsen [<xref ref-type="bibr" rid="ref30">30</xref>] argues that these findings can be attributed to experimental artifacts and that LLMs are indeed capable of reasoning consistently and accurately when experimental setups are properly designed. Regardless of whether LLMs truly mimic human deductive reasoning when prompted with reasoning techniques, using human decision-making as a source of inspiration for developing prompting strategies is a promising direction. This is especially true in situations with high uncertainty, where deductive reasoning quickly reaches its limits, but humans are nonetheless able to make fairly good decisions [<xref ref-type="bibr" rid="ref31">31</xref>].</p><p>Insights from the fields of applied psychology and human factors and ergonomics (HF/E) suggest that there is a gap between how humans reason in real-world situations and the assumed standard reasoning approaches related to deduction and induction, which are typically used to instruct and evaluate LLMs [<xref ref-type="bibr" rid="ref28">28</xref>]. One explanation for this difference may be that humans often make decisions under uncertainty, with incomplete or ambiguous information, and decision tasks and goals are often ill-structured [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>]. In contrast, most psychology experiments, as well as the current benchmarks for LLMs, rely on well-structured, multiple-choice tasks where all necessary information is explicitly provided, and the LLMs are merely asked to choose the correct answer out of multiple options, which can be readily evaluated against a clear-cut gold standard solution [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. This test format is also used in most educational assessments, and LLMs perform well on this format&#x2014;for example, passing professional and board certification exams in medicine, psychotherapy, and law. 
As a result, LLMs are widely promoted as accurate decision-support tools for well-structured tasks, and many users use them for this purpose [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref45">45</xref>]. However, when existing models are evaluated on real-world datasets, which more accurately reflect the complexity and ambiguity in real decision-making, their performance seems to be considerably worse [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p><p>The distinction between decision-making in idealized situations and in complex, ill-defined real-world settings has long been recognized in behavioral economics and psychology. In 1955, Herbert Simon introduced the concept of bounded rationality to study how human decision-making takes place under limited cognitive and environmental resources rather than under the conditions of perfect knowledge and unlimited resources, the normative ideal of full rationality, which is currently assumed in many LLM benchmarks [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. Building on the idea of bounded rationality, the field of naturalistic decision-making (NDM) developed to study how experts make good decisions in real-world contexts [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Research in NDM shows that, rather than exhaustively comparing all possible options, both experts and novices typically rely on a limited set of information to recognize the most promising option or action [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. 
This strategy is not perfectly accurate in all situations, but it often results in highly accurate decisions within short timeframes [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. Specifically, findings from the NDM field suggest that in situations with high information validity, experts often perform on par with complex algorithms, even when using less information and simpler strategies [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. This can be explained by the fact that experts rarely follow a strictly deductive or inductive process. Instead, they quickly recognize which information is relevant and engage in <italic>abductive reasoning</italic>; that is, they generate an initial hypothesis based on the observed piece of relevant information and then seek out information to test this hypothesis and update it as new contradicting information becomes available [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref56">56</xref>].</p><p>To describe human decision-making in real-world scenarios, 2 models feature most prominently in the NDM literature: recognition-primed decision-making (RPD) [<xref ref-type="bibr" rid="ref56">56</xref>] and the data-frame theory [<xref ref-type="bibr" rid="ref57">57</xref>]. The RPD model, which is used to make quick decisions in familiar situations, consists of 2 core processes: a pattern-matching loop and a mental simulation loop [<xref ref-type="bibr" rid="ref56">56</xref>]. In the pattern matching loop, decision-makers assess whether a situation is familiar (eg, they recognize whether they have experienced a similar situation before). If the situation is recognized as familiar, they directly implement an action. 
If it is unfamiliar, they either reassess the situation or seek additional information until they achieve some sense of familiarity and can proceed to the mental simulation loop. In the subsequent mental simulation loop, decision-makers simulate implementing their chosen course of action. If they conclude that this action will most likely work, they implement it; if not, they modify the plan and reassess, or they consider new actions entirely [<xref ref-type="bibr" rid="ref56">56</xref>].</p><p>The data-frame theory focuses more on sense-making and understanding new, unknown situations rather than on making decisions [<xref ref-type="bibr" rid="ref57">57</xref>]. According to this model, humans use frames (basic ideas, hypotheses, or mental models about what is happening in a given context) and data (information in the environment). These 2 concepts interact: frames determine what information is noticed, sought, and how it is interpreted. At the same time, new data can lead a person to elaborate, revise, or even abandon their current frame. For example, in medical diagnosis, a physician may form an initial hypothesis (frame) based on presenting symptoms, then gather additional data to either confirm or reconsider that frame based on new information [<xref ref-type="bibr" rid="ref57">57</xref>].</p><p>Although there is strong evidence supporting the occurrence, efficiency, and effectiveness of NDM models, such as the RPD model or the data-frame theory in real-world decision-making, these approaches have not yet been applied to instruct LLMs [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. Existing reasoning prompts and models are inspired by an ideal form of human decision-making and deductive reasoning, and they seem to perform well on well-structured problems with known risks and gold-standard solutions, and less so in situations involving real-world ambiguity and uncertainty. 
Although for the latter situation, NDM-based strategies may prove more effective, they have not yet been applied to improve and evaluate LLM performance in ill-defined, real-world tasks. In this study, we aimed to test whether prompts based on NDM principles can improve LLM performance on a real-world, ill-defined task.</p><p>Building on our previous work, we used a standardized and validated evaluation framework that used a care-seeking or &#x201C;self-triage&#x201D; decision scenario involving real patient cases to evaluate NDM-based prompts [<xref ref-type="bibr" rid="ref60">60</xref>]. Self-triage refers to the decision-making process by which people determine whether medical care is needed and, if so, where and how urgently to seek it (eg, self-care, primary care, or emergency care) [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. This is a common decision task in everyday life, with 80% to 90% of the population reporting at least 1 symptom within a given month [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>], and laypeople are increasingly consulting digital tools, such as LLMs, for advice [<xref ref-type="bibr" rid="ref65">65</xref>,<xref ref-type="bibr" rid="ref66">66</xref>]. Previous research shows that human performance in self-triage decisions is moderate and that LLMs perform only slightly better on average, although they almost always recommend professional care rather than self-care [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref67">67</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. Self-triage is thus a suitable use case for the present study because it is typically ill-structured: information about symptoms may be incomplete and ambiguous, and decision-makers must decide under uncertainty. 
Therefore, self-triage is a representative example of the real-world decisions studied in NDM research.</p><p>We hypothesized that prompts inspired by the RPD model and the data-frame theory will significantly increase accuracy on these tasks in selecting the best course of action across both nonreasoning and reasoning models, compared to a standard zero-shot prompt.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This evaluation study was designed as a prospective, longitudinal, observational LLM validation study. The intervention was the specific prompting strategy: a regular prompt, a recognition-primed prompt, and a data-frame prompt. We used these 3 prompting strategies to assess 45 vignettes across 10 models, each tested 10 times. The primary outcome was the accuracy of the models under each prompting condition, and the secondary outcome was the output variability of the tested models. No participants were involved in this study.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study did not involve any prospective recruitment, interaction, or intervention with human participants. The LLM evaluation used an existing dataset of symptom descriptions originally collected on an online platform. Ethical approval for this collection, use, and deidentification of the cases was obtained from the ethics committee of the Department of Psychology and Ergonomics at Technische Universit&#x00E4;t Berlin (AWB_KOP_2_230711). For the present study, we accessed only the deidentified version of these cases [<xref ref-type="bibr" rid="ref70">70</xref>]. Pseudonymized identifiers (eg, user names) were completely removed, and potential quasi-identifiers in free text (eg, city or institution names) were deleted. The dataset was stored on an access-restricted institutional computer, and the data were used solely for model evaluation. No attempts were made to reidentify individuals. 
Accordingly, no additional ethical approval was required for this secondary analysis. The reporting of this manuscript follows the TRIPOD (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis)&#x2013;LLM guideline [<xref ref-type="bibr" rid="ref71">71</xref>].</p></sec><sec id="s2-3"><title>Tested Models</title><p>Because ChatGPT remains the most widely used LLM family [<xref ref-type="bibr" rid="ref72">72</xref>], we focused our evaluation on LLMs currently available within ChatGPT. For the initial data collection, this included GPT-4o, GPT-4.1, GPT-4.1 mini, o3, o4 mini, and o4 mini high. For a second round of data collection, this included GPT-5.1 and GPT-5.2, including both the Instant and Thinking versions. All models are based on the Transformer architecture; however, o3, o4 mini, o4 mini high, GPT-5.1 Thinking, and GPT-5.2 Thinking include a reasoning process prior to generating output for users [<xref ref-type="bibr" rid="ref23">23</xref>]. All models were tested using the default parameters to approximate consumer-facing ChatGPT model behavior. Thus, we used each model&#x2019;s default temperature (1), a top-p of 1, did not specify a maximum output length (max_tokens unset), and did not specify a random seed. Because outputs are stochastic without a seed, we repeated each vignette-model-prompt condition 10 times and reported variability to approximate consumer-facing behavior. For GPT-5.1 and GPT-5.2, we set the reasoning.effort parameter to none (to disable reasoning and emulate consumer-facing GPT-5.1 and GPT-5.2 Instant), and to medium (to emulate GPT-5.1 and GPT-5.2 Thinking). Additionally, we conducted a sensitivity analysis using the 2 models for which the prompts yielded the largest accuracy gains. 
We tested a temperature of 0 (maximum determinism/control) and did not include a higher temperature option because the models frequently refused to provide recommendations with a temperature higher than the default temperature.</p></sec><sec id="s2-4"><title>Task and Evaluation Dataset</title><p>Our task consisted of obtaining advice on which care-seeking option is most appropriate for the described symptoms. This type of task is commonly used to evaluate both digital health applications and LLMs [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref73">73</xref>-<xref ref-type="bibr" rid="ref75">75</xref>]. We selected it specifically because it reflects a real use case for ChatGPT [<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref76">76</xref>-<xref ref-type="bibr" rid="ref80">80</xref>], involves uncertainty (ie, unknown risks), and often deals with ambiguous or incomplete data or symptoms. For these reasons, we considered this task well suited to test the influence of NDM-inspired prompts on LLM performance in real-world problems.</p><p>The dataset was developed in previous studies and followed current guidelines for evaluating care-seeking decision support systems [<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref81">81</xref>-<xref ref-type="bibr" rid="ref83">83</xref>]. From an &#x201C;ask the doctor&#x201D; online platform, 45 real patient cases, where medical laypeople described their symptoms and sought advice from professionals, were collected between October 2023 and January 2024 and psychometrically validated [<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref81">81</xref>]. 
Cases were further stratified to reflect the natural base rates of symptom types that are typically entered into online care-seeking advice tools based on the Centers for Disease Control and Prevention&#x2019;s National Ambulatory Medical Care Survey [<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref84">84</xref>]. Because of this stratification, the dataset included only 2 emergency care cases, 30 nonemergency care cases, and 13 self-care cases. To minimize editing effects, only typos were corrected. The original cases cannot be reproduced in this manuscript for copyright reasons, but they are available from the authors upon reasonable request. The cases describe acute symptoms for which laypeople seek decision support on whether and where to seek care. They cover a range of physical symptom presentations across specialties and are written in natural, nontechnical language from the perspective of medical laypeople. For example, 1 case describes a person reporting pain near the chest or lung area during the night when lying down, which improves when upright but worsens with breathing. After a few minutes, the person noticed a weird sensation, and the pain went away. The person then asks whether they should still seek medical care. Another exemplary case describes a person who notices a visible dent in the lower leg or heel area after sitting with one leg crossed for about 20 minutes. 
The person says this happens often and is usually painless, but this time the legs seem more swollen than usual, so the dent lasts longer.</p><p>To determine the most appropriate level of care, a physician panel consisting of 2 licensed physicians independently rated each case and then discussed their assessments until they reached a consensus assignment on 3 possible care-seeking options: (1) emergency care, defined as situations requiring emergency department visits; (2) nonemergency care, where consulting a general practitioner or another medical specialist is appropriate; and (3) self-care, where patients can monitor or manage symptoms themselves. Further details on the dataset are available in our publication on the RepVig framework [<xref ref-type="bibr" rid="ref70">70</xref>].</p><p>Since the prompts included a specific output format, we used text pattern&#x2013;based matching to extract the recommendation given by the LLM. If no direct match was found, we assessed the cases manually.</p></sec><sec id="s2-5"><title>Prompts</title><p>To evaluate the baseline accuracy of the tested models, we used the prompt shown in <xref ref-type="other" rid="box1">Textbox 1</xref>. This prompt was developed in previous studies evaluating the care-seeking advice of LLMs [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref69">69</xref>,<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref85">85</xref>] and was adapted to specify an output structure (ie, &#x201C;Answer: Classification&#x201D;) that enabled us to classify model responses using natural language processing.</p><p>The NDM-inspired prompts were based on this base prompt but additionally included the name of the respective model (ie, &#x201C;Recognition-Primed Decision-Making model&#x201D; and &#x201C;Data-Frame-Theory approach&#x201D;) to prime the LLMs accordingly. 
Each prompt then provided instructions on how to reason following the principles of the corresponding model. The full prompts are shown in <xref ref-type="other" rid="box2">Textboxes 2 and 3</xref>.</p><boxed-text id="box1"><title> Care-seeking advice prompt used in the control group to determine the base accuracy of the tested large language models.</title><p>Please give a triage classification for the following vignette. The criteria for triage classifications are presented below:</p><list list-type="bullet"><list-item><p>Emergency: Call 911 or go directly to the emergency room;</p></list-item><list-item><p>Nonemergency: Try to see a doctor, but the symptoms do not require immediate attention at an emergency room. For example, you can see a doctor in the next week.</p></list-item><list-item><p>Self-care: Let the health issue get better on its own and review the situation again in a few days.</p></list-item></list><p>Respond in the following format:Answer: Your answer [either Emergency, Nonemergency, or Self-Care].</p></boxed-text><boxed-text id="box2"><title> Recognition-primed prompt instructing the large language models to reason according to the recognition-primed decision-making model.</title><p>Please give a triage classification for the following vignette. The criteria for triage classifications are presented below:</p><list list-type="bullet"><list-item><p>Emergency: Call 911 or go directly to the emergency room;</p></list-item><list-item><p>Nonemergency: Try to see a doctor, but the symptoms do not require immediate attention at an emergency room. For example, you can see a doctor in the next week.</p></list-item><list-item><p>Self-care: Let the health issue get better on its own and review the situation again in a few days.</p></list-item></list><p>Use the recognition-primed decision-making model to make your decision. Does this situation match any typical cases you know? If yes, what is the usual decision for such a case? 
Simulate implementing this decision for the described situation and test whether it will work. If not, modify it and test again whether it will work. If the situation is not similar to any typical case you know, try to reassess the situation until you think it sounds familiar, simulate the implementation again, and test whether it will work.</p><p>Respond in the following format:Analysis: Your analysis || Answer: Your answer [either Emergency, Nonemergency, or Self-Care].</p></boxed-text><boxed-text id="box3"><title> Data-frame prompt instructing the large language models to reason according to the data-frame theory.</title><p>Please give a triage classification for the following vignette. The criteria for triage classifications are presented below:</p><list list-type="bullet"><list-item><p>Emergency: Call 911 or go directly to the emergency room;</p></list-item><list-item><p>Nonemergency: Try to see a doctor, but the symptoms do not require immediate attention at an emergency room. For example, you can see a doctor in the next week.</p></list-item><list-item><p>Self-care: Let the health issue get better on its own and review the situation again in a few days.</p></list-item></list><p>Before giving your triage classification, think about the correct classification using the Data-Frame-Theory approach. 
As you analyze the vignette, actively use the following reasoning processes (as needed, not necessarily in order):</p><list list-type="bullet"><list-item><p>Construct or recognize a frame: Identify the main interpretation or mental model that organizes the case information.</p></list-item><list-item><p>Elaborate the frame: Seek out or infer additional relevant details from the vignette.</p></list-item><list-item><p>Question the frame: Look for inconsistencies, surprising data, or violated expectations.</p></list-item><list-item><p>Preserve the frame: Consider whether your interpretation still fits, or if any data needs to be reinterpreted.</p></list-item><list-item><p>Seek a new frame: If appropriate, consider alternative interpretations.</p></list-item><list-item><p>Reframe: Revise your perspective and reinterpret the data if needed.</p></list-item><list-item><p>Compare frames: Identify and weigh alternative ways of understanding the case.</p></list-item></list><p>Respond in the following format:Reflection process: Your reflection || Answer: Your answer [either Emergency, Nonemergency, or Self-Care].</p></boxed-text><p>All prompts were tested for feasibility in a pretest, during which the authors tested the prompts and API calls using random cases and manually assessed the output for adherence to the instructions and correct formatting.</p></sec><sec id="s2-6"><title>Procedure</title><p>We used a custom-built Python script to access the OpenAI API on May 23, 2025, and a second time for newer models on February 23, 2026. For each model, the prompts (<xref ref-type="other" rid="box1">Textboxes 1-3</xref>) were entered as system prompts, and the case vignettes as user prompts. The context window was cleared before every call. 
Because of the high output variability observed in LLMs [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref86">86</xref>,<xref ref-type="bibr" rid="ref87">87</xref>], we tested each model on each case 10 times as a quality-management measure to account for the fact that different users may receive different advice for the same input. Model outputs were then classified automatically in R into 3 categories (emergency, nonemergency, self-care). For cases in which the category could not be determined through keyword or pattern matching (n=61), manual coding was performed by reading through the answer and assigning a classification manually.</p></sec><sec id="s2-7"><title>Outcome Measures</title><p>The primary outcome was classification accuracy, defined as whether the model&#x2019;s triage recommendation matched the physician-panel gold standard reference for each vignette (ie, correct or incorrect). This metric was chosen because it most closely measures the potential behavioral and safety impact the prompts can have on users. Secondary outcomes included the accuracy by each triage level (emergency, nonemergency, or self-care), calculated as the proportion of correct recommendations within each stratum. Additionally, because the vignette set included only 2 emergency cases, we dichotomized the triage levels into 2 groups: requiring medical care versus self-care. Next, we assessed output variability using Fleiss&#x2019; Kappa for each model-vignette-prompt combination, and by assessing the consistency of model recommendations, that is, the proportion of the 10 trials corresponding to the most frequently given recommendation. 
Lastly, we assessed technical accuracy by coding whether the correct recommendation was given at least once among all 10 trials.</p></sec><sec id="s2-8"><title>Data Analysis</title><p>All analyses were conducted in R using the packages symptomcheckR, tidyverse, psych, and lme4 [<xref ref-type="bibr" rid="ref88">88</xref>-<xref ref-type="bibr" rid="ref91">91</xref>]. To assess the accuracy of each prompt, we calculated the mean proportion of correctly solved cases and quantified precision using 95% CIs. To test our hypothesis that the NDM-inspired prompts increase LLM performance, we used mixed effects binomial logistic regression (with prompt type as a fixed effect and random intercepts for model, vignette, and model-by-vignette combination to account for repeated observations and clustering in our data&#x2014;that is, for having each model assess each vignette 10 times) with 2-sided tests. Additionally, we conducted subgroup analyses to assess accuracy in dichotomized decisions (ie, professional care vs self-care), accuracy by model, and accuracy by each care-seeking level. In sensitivity analyses, we further tested whether the reported results remained stable with a low-temperature setting. We chose the 2 models with the highest and lowest prompt-dependent accuracy improvement (ie, GPT-4.1 mini and GPT-5.2 Instant).</p><p>To quantify output variability for each prompt, vignette, and model combination, we calculated Fleiss &#x03BA; and recorded the frequency with which the most common recommendation was given across the 10 trials for each vignette and model. 
As an estimate of &#x201C;technical accuracy&#x201D; (ie, whether the model was technically capable of generating the correct advice), we noted whether the correct recommendation was given at least once in 10 trials [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref92">92</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Assessments</title><p>We used 45 vignettes to test 10 models, with each model run 10 times per vignette using 3 prompting strategies (default, recognition-primed prompting, and data-frame prompting). This resulted in a total of 13,500 individual assessments.</p></sec><sec id="s3-2"><title>Overall Accuracy of Each Prompt</title><p>The average accuracy across all models, vignettes, and trials was 63.3% (95% CI 61.9%&#x2010;64.7%) for the default prompt, 67.6% (95% CI 66.3%&#x2010;69%) for the recognition-primed prompt, and 66.7% (95% CI 65.3%&#x2010;68%) for the data-frame prompt. Both the recognition-primed prompt (OR 2.26, <italic>z</italic>=8.69; <italic>P</italic>&#x003C;.001) and the data-frame prompt (OR 2.05, <italic>z</italic>=7.23; <italic>P</italic>&#x003C;.001) significantly increased accuracy compared to the default prompt. Improvements were greater for reasoning models than for nonreasoning models (OR 2.15, <italic>z</italic>=4.05, <italic>P</italic>&#x003C;.001 for the recognition-primed prompt and OR 1.70, <italic>z</italic>=2.65, <italic>P</italic>=.008 for the data-frame prompt). The largest increase in accuracy compared to the default prompt was observed for GPT-4.1 mini with the data-frame prompt, with an improvement of 13 percentage points (95% CI 9.7&#x2010;16.1), as shown in <xref ref-type="table" rid="table1">Table 1</xref>. In a binary choice, that is, care versus self-care, results remained similar, as shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
The same holds true when tested with a low-temperature setting, as shown in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The accuracy of all tested models for each prompt.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Default prompt, mean (95% CI)</td><td align="left" valign="bottom">Recognition-primed prompt, mean (95% CI)</td><td align="left" valign="bottom">Data-frame prompt, mean (95% CI)</td><td align="left" valign="bottom">Model type</td></tr></thead><tbody><tr><td align="left" valign="top">Overall (%)</td><td align="left" valign="top">63.3 (61.9&#x2010;64.7)</td><td align="left" valign="top">67.6 (66.3 &#x2010;69)</td><td align="left" valign="top">66.7 (65.3&#x2010;68)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">GPT-4o (%)</td><td align="left" valign="top">65.3 (60.8&#x2010;69.6)</td><td align="left" valign="top">70.7 (66.3&#x2010;74.7)</td><td align="left" valign="top">66.2 (61.7&#x2010;70.4)</td><td align="left" valign="top">Nonreasoning</td></tr><tr><td align="left" valign="top">GPT-4.1 (%)</td><td align="left" valign="top">64.4 (59.9&#x2010;68.7)</td><td align="left" valign="top">72.7 (68.4&#x2010;76.6)</td><td align="left" valign="top">72.4 (68.1&#x2010;76.4)</td><td align="left" valign="top">Nonreasoning</td></tr><tr><td align="left" valign="top">GPT-4.1 mini (%)</td><td align="left" valign="top">49.8 (45.2&#x2010;54.4)</td><td align="left" valign="top">60.7 (56.1&#x2010;65.1)</td><td align="left" valign="top">62.4 (57.9&#x2010;66.8)</td><td align="left" valign="top">Nonreasoning</td></tr><tr><td align="left" valign="top">o3 (%)</td><td align="left" valign="top">70.7 (66.3&#x2010;74.7)</td><td align="left" valign="top">75.1 
(70.9&#x2010;78.9)</td><td align="left" valign="top">76.2 (72.1&#x2010;79.9)</td><td align="left" valign="top">Reasoning</td></tr><tr><td align="left" valign="top">o4 mini (%)</td><td align="left" valign="top">69.3 (64.9&#x2010;73.4)</td><td align="left" valign="top">70.7 (66.3&#x2010;74.7)</td><td align="left" valign="top">72.9 (68.6&#x2010;76.8)</td><td align="left" valign="top">Reasoning</td></tr><tr><td align="left" valign="top">o4 mini high (%)</td><td align="left" valign="top">68.9 (64.5&#x2010;73)</td><td align="left" valign="top">71.3 (67&#x2010;75.3)</td><td align="left" valign="top">70.7 (66.3&#x2010;74.7)</td><td align="left" valign="top">Reasoning</td></tr><tr><td align="left" valign="top">GPT-5.1 Instant (%)</td><td align="left" valign="top">64 (59.4&#x2010;68.4)</td><td align="left" valign="top">71.1 (66.7&#x2010;75.3)</td><td align="left" valign="top">66.2 (61.6&#x2010;70.6)</td><td align="left" valign="top">Nonreasoning</td></tr><tr><td align="left" valign="top">GPT-5.1 Thinking (%)</td><td align="left" valign="top">70.7 (66.2&#x2010;74.8)</td><td align="left" valign="top">74.7 (70.4&#x2010;78.6)</td><td align="left" valign="top">72.4 (68.1&#x2010;76.5)</td><td align="left" valign="top">Reasoning</td></tr><tr><td align="left" valign="top">GPT-5.2 Instant (%)</td><td align="left" valign="top">57.8 (53.1&#x2010;62.4)</td><td align="left" valign="top">56.4 (51.7&#x2010;61.1)</td><td align="left" valign="top">55.3 (50.6&#x2010;60)</td><td align="left" valign="top">Nonreasoning</td></tr><tr><td align="left" valign="top">GPT-5.2 Thinking (%)</td><td align="left" valign="top">52 (47.3&#x2010;56.7)</td><td align="left" valign="top">53.1 (48.4&#x2010;57.8)</td><td align="left" valign="top">51.8 (47.1&#x2010;56.5)</td><td align="left" valign="top">Reasoning</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>The recognition-primed prompt increased accuracy in 18 of 45 cases (40%) 
and decreased accuracy in 13 cases (29%) on average across all models. Its median increase in accuracy was 18% (IQR 5%&#x2010;24%), whereas the median decrease was 5% (IQR 2%&#x2010;7%). The data-frame prompt increased accuracy in 17 of 45 (38%) cases and reduced it in 12 (27%) cases. Its median increase in accuracy was 14% (IQR 4%&#x2010;19%), and the median decrease was 5% (IQR 3%&#x2010;8%). Most decreases in accuracy affected nonemergency cases (12/13, 92% for the recognition-primed prompt and 11/12, 92% for the data-frame prompt, see the Triage-Level Accuracy of Each Prompt section for the direction). Increases were observed in both nonemergency and self-care cases (8/18, 44%, and 10/18, 56%, respectively, for the recognition-primed prompt; 8/17, 47%, and 9/17, 53%, for the data-frame prompt, see the next section for the direction) (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-3"><title>Triage-Level Accuracy of Each Prompt</title><p>Across all 3 prompts, the models tended to recommend higher-than-necessary urgency (accounting for 88% of all errors, 95% CI 87%&#x2010;88.9%) rather than lower-than-necessary urgency (12% of all errors, 95% CI 11.1%&#x2010;13%). With the default prompt, both emergency cases were correctly identified (100%, 95% CI 98.2%&#x2010;100%). Using both the data-frame prompt and the recognition-primed prompt, both emergency cases were also mostly identified correctly (99%, 95% CI 96.4%&#x2010;99.9% and 98%, 95% CI 95%&#x2010;99.5%, respectively), although some trials resulted in incorrect nonemergency advice (1%, 95% CI 0.1%&#x2010;3.6% and 2%, 95% CI 0.5%&#x2010;5%, respectively). Accuracy for nonemergency cases was similar across all prompts: 82.5% (95% CI 81.1%&#x2010;83.8%) for the default prompt, 82% (95% CI 80.5%&#x2010;83.3%) for the recognition-primed prompt, and 82.8% (95% CI 81.4%&#x2010;84.1%) for the data-frame prompt. 
The largest difference was observed for self-care cases: With the default prompt, the models correctly identified only 13.4% (95% CI 11.6%&#x2010;15.4%), compared to 29.8% (95% CI 27.3%&#x2010;32.3%) with the recognition-primed prompt and 24.6% (95% CI 22.3%&#x2010;27.1%) with the data-frame prompt (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The results remained similar in a binary choice task and also when tested with a low temperature setting (Tables S1 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Confusion matrix showing the classification of each prompt across all models compared to the correct vignette solution. Emergency estimates may be unreliable because only 2 cases were included.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="biomedeng_v11i1e88053_fig01.png"/></fig><p>Notably, nonreasoning models that never or rarely provided self-care advice with the default prompt began providing self-care advice with relatively high accuracy when using the NDM-inspired prompts (eg, 0%, 95% CI 0%&#x2010;2.9% for the default prompt in GPT-4.1, compared to 43.8%, 95% CI 35.6%&#x2010;52.4% for the recognition-primed prompt and 39.2%, 95% CI 31.3%&#x2010;47.8% for the data-frame prompt). 
For reasoning models, which already gave self-care advice with the default prompt, accuracy further improved with the NDM-inspired prompts (eg, 46.9%, 95% CI 38.6%&#x2010;55.5% with the default prompt in o4 mini; 63.8%, 95% CI 55.3%&#x2010;71.6% with the recognition-primed prompt; and 56.2%, 95% CI 47.6%&#x2010;64.4% with the data-frame prompt; <xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Accuracy of each model and prompt in generating care-seeking advice by correct vignette solution.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and vignette type</td><td align="left" valign="bottom">Default prompt, mean (95% CI)</td><td align="left" valign="bottom">Recognition-primed prompt, mean (95% CI)</td><td align="left" valign="bottom">Data-frame prompt, mean (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">GPT-4o (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">90.3 (86.5&#x2010;93.2)</td><td align="left" valign="top">92.3 (88.8&#x2010;94.8)</td><td align="left" valign="top">87.7 (83.5&#x2010;90.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">2.3 (0.8&#x2010;6.6)</td><td align="left" valign="top">16.2 (10.8&#x2010;23.4)</td><td align="left" valign="top">11.5 (7.1&#x2010;18.2)</td></tr><tr><td align="left" valign="top" colspan="4">GPT-4.1 
(%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">80 (58.4&#x2010;91.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">90 (86.1&#x2010;92.9)</td><td align="left" valign="top">83.3 (78.7&#x2010;87.1)</td><td align="left" valign="top">86.3 (82&#x2010;89.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">0 (0&#x2010;2.9)</td><td align="left" valign="top">43.8 (35.6&#x2010;52.4)</td><td align="left" valign="top">39.2 (31.3&#x2010;47.8)</td></tr><tr><td align="left" valign="top" colspan="4">GPT-4.1 mini (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">68 (62.5&#x2010;73)</td><td align="left" valign="top">81.3 (76.5&#x2010;85.3)</td><td align="left" valign="top">87 (82.7&#x2010;90.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-Care</td><td align="left" valign="top">0 (0&#x2010;2.9)</td><td align="left" valign="top">6.9 (3.7&#x2010;12.6)</td><td align="left" valign="top">0 (0&#x2010;2.9)</td></tr><tr><td align="left" valign="top" colspan="4">o3 (%)</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">93.7 (90.3&#x2010;95.9)</td><td align="left" valign="top">90.7 (86.8&#x2010;93.5)</td><td align="left" valign="top">92.7 (89.1&#x2010;95.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">13.1 (8.3&#x2010;19.9)</td><td align="left" valign="top">35.4 (27.7&#x2010;43.9)</td><td align="left" valign="top">34.6 (27&#x2010;43.1)</td></tr><tr><td align="left" valign="top" colspan="4">o4 mini (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">77 (71.9&#x2010;81.4)</td><td align="left" valign="top">71.7 (66.3&#x2010;76.5)</td><td align="left" valign="top">78.3 (73.3&#x2010;82.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">46.9 (38.6&#x2010;55.5)</td><td align="left" valign="top">63.8 (55.3&#x2010;71.6)</td><td align="left" valign="top">56.2 (47.6&#x2010;64.4)</td></tr><tr><td align="left" valign="top" colspan="4">o4 mini high 
(%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">75.7 (70.5&#x2010;80.2)</td><td align="left" valign="top">74.7 (69.5&#x2010;79.3)</td><td align="left" valign="top">74 (68.8&#x2010;78.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">48.5 (40&#x2010;57)</td><td align="left" valign="top">59.2 (50.6&#x2010;67.3)</td><td align="left" valign="top">58.5 (49.9&#x2010;66.6)</td></tr><tr><td align="left" valign="top" colspan="4">GPT-5.1 Instant (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">95 (76.4&#x2010;99.1)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">87 (82.7&#x2013;90.3)</td><td align="left" valign="top">86 (81.6&#x2010;89.5)</td><td align="left" valign="top">88 (83.8&#x2010;91.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">5.4 (2.6&#x2010;10.7)</td><td align="left" valign="top">33.1 (25.6&#x2010;41.5)</td><td align="left" valign="top">10.8 (6.5&#x2010;17.3)</td></tr><tr><td align="left" valign="top" 
colspan="4">GPT-5.1 Thinking (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">95 (76.4&#x2010;99.1)</td><td align="left" valign="top">100% (83.9%&#x2010;100%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">92 (88.4&#x2010;94.6)</td><td align="left" valign="top">90 (86.1&#x2010;92.9)</td><td align="left" valign="top">87 (82.7&#x2010;90.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">16.9 (11.4&#x2010;24.3)</td><td align="left" valign="top">36.2 (28.4&#x2010;44.7)</td><td align="left" valign="top">34.6 (27&#x2010;43.1)</td></tr><tr><td align="left" valign="top" colspan="4">GPT-5.2 Instant (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">80 (75.1&#x2010;84.1)</td><td align="left" valign="top">76.7 (71.6&#x2010;81.1)</td><td align="left" valign="top">76.3 (71.2&#x2010;80.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">0 (0&#x2010;2.9)</td><td align="left" valign="top">3.1 (1.2&#x2010;7.6)</td><td align="left" valign="top">0 (0&#x2010;2.9)</td></tr><tr><td align="left" 
valign="top" colspan="4">GPT-5.2 Thinking (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Emergency (n=2)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td><td align="left" valign="top">100 (83.9&#x2010;100)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonemergency</td><td align="left" valign="top">71 (65.6&#x2010;75.8)</td><td align="left" valign="top">73 (67.7&#x2010;77.7)</td><td align="left" valign="top">70.7 (65.3&#x2010;75.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-care</td><td align="left" valign="top">0.8 (0.1&#x2010;4.2)</td><td align="left" valign="top">0 (0&#x2010;2.9)</td><td align="left" valign="top">0.8 (0.1&#x2010;4.2)</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Output Variability of Each Prompting Technique</title><p>Intertrial reliability&#x2014;that is, the frequency with which a vignette received the same advice from the same model across multiple trials&#x2014;was comparable across all prompts, with median Fleiss &#x03BA; values of 0.766 (IQR 0.706&#x2010;0.884) for the default prompt, 0.717 (IQR 0.681&#x2010;0.743) for the recognition-primed prompt, and 0.751 (IQR 0.725&#x2010;0.773) for the data-frame prompt (<xref ref-type="table" rid="table3">Table 3</xref>). 
The results remained similar when tested with a low-temperature setting (Table S5
align="left" valign="top">0.644</td><td align="left" valign="top">0.620</td></tr><tr><td align="left" valign="top">GPT-5.1 Thinking</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.723</td><td align="left" valign="top">0.745</td></tr><tr><td align="left" valign="top">GPT-5.2 Instant</td><td align="left" valign="top">0.926</td><td align="left" valign="top">0.742</td><td align="left" valign="top">0.775</td></tr><tr><td align="left" valign="top">GPT-5.2 Thinking</td><td align="left" valign="top">0.851</td><td align="left" valign="top">0.821</td><td align="left" valign="top">0.839</td></tr></tbody></table></table-wrap><p>When considering the most frequently given recommendation for each vignette by each model, all prompts yielded relatively consistent advice across multiple trials (mean 76.9%, 95% CI 72.7%&#x2010;80.7% for the default prompt; mean 66.9%, 95% CI 62.3%&#x2010;71.2% for the recognition-primed prompt; mean 71.1%, 95% CI 66.7%&#x2010;75.3% for the data-frame prompt), as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. 
There was no statistically significant difference between the prompts in how often a specific option was recommended across trials (<italic>z</italic>=1.50, <italic>P</italic>=.13 for the recognition-primed prompt; <italic>z</italic>=0.72, <italic>P</italic>=.47 for the data-frame prompt).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Number of times the most frequently advised recommendation was given among the 3 prompts.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="biomedeng_v11i1e88053_fig02.png"/></fig><p>However, the tested models were more likely to provide the correct solution at least once across multiple trials when using the recognition-primed prompt (mean 82.2%, 95% CI 78.4%&#x2010;85.6%) and the data-frame prompt (mean 78.4%, 95% CI 74.4%&#x2010;82.2%) compared to the default prompt (mean 73.1%, 95% CI 68.8%&#x2010;77.2%) (<xref ref-type="table" rid="table4">Table 4</xref>). The results remained similar when tested with a low-temperature setting (Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Percentage of cases that were solved correctly at least once among 10 trials.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Default prompt, mean (95% CI)</td><td align="left" valign="bottom">Recognition-primed prompt, mean (95% CI)</td><td align="left" valign="bottom">Data-frame prompt, mean (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Overall (%)</td><td align="left" valign="top">73.1 (68.8&#x2010;77.2)</td><td align="left" valign="top">82.2 (78.4&#x2010;85.6)</td><td align="left" valign="top">78.4 (74.4&#x2010;82.2)</td></tr><tr><td align="left" valign="top">GPT-4o (%)</td><td align="left" valign="top">77.8 (62.9&#x2010;88.8)</td><td 
align="left" valign="top">84.4 (70.5&#x2010;93.5)</td><td align="left" valign="top">80 (65.4&#x2010;90.4)</td></tr><tr><td align="left" valign="top">GPT-4.1 (%)</td><td align="left" valign="top">66.7 (51&#x2010;80)</td><td align="left" valign="top">84.4 (70.5&#x2010;93.5)</td><td align="left" valign="top">84.4 (70.5&#x2010;93.5)</td></tr><tr><td align="left" valign="top">GPT-4.1 mini (%)</td><td align="left" valign="top">55.6 (40&#x2010;70.4)</td><td align="left" valign="top">75.6 (60.5&#x2010;87.1)</td><td align="left" valign="top">66.7 (51&#x2010;80)</td></tr><tr><td align="left" valign="top">o3 (%)</td><td align="left" valign="top">82.2 (67.9&#x2010;92)</td><td align="left" valign="top">91.1 (78.8&#x2010;97.5)</td><td align="left" valign="top">84.4 (70.5&#x2010;93.5)</td></tr><tr><td align="left" valign="top">o4 mini (%)</td><td align="left" valign="top">86.7 (73.2&#x2010;94.9)</td><td align="left" valign="top">88.9 (75.9&#x2010;96.3)</td><td align="left" valign="top">86.7 (73.2&#x2010;94.9)</td></tr><tr><td align="left" valign="top">o4 mini high (%)</td><td align="left" valign="top">91.1 (78.8&#x2010;97.5)</td><td align="left" valign="top">88.9 (75.9&#x2010;96.3)</td><td align="left" valign="top">86.7 (73.2&#x2010;94.9)</td></tr><tr><td align="left" valign="top">GPT-5.1 Instant (%)</td><td align="left" valign="top">68.9 (53.4&#x2010;81.8)</td><td align="left" valign="top">86.7 (73.2&#x2010;94.9)</td><td align="left" valign="top">82.2 (67.9&#x2010;92)</td></tr><tr><td align="left" valign="top">GPT-5.1 Thinking (%)</td><td align="left" valign="top">82.2 (67.9&#x2010;92)</td><td align="left" valign="top">88.9 (75.9&#x2010;96.3)</td><td align="left" valign="top">86.7 (73.2&#x2010;94.9)</td></tr><tr><td align="left" valign="top">GPT-5.2 Instant (%)</td><td align="left" valign="top">60 (44.3&#x2010;74.3)</td><td align="left" valign="top">73.3 (58.1&#x2010;85.4)</td><td align="left" valign="top">64.4 (48.8&#x2010;78.1)</td></tr><tr><td align="left" 
valign="top">GPT-5.2 Thinking (%)</td><td align="left" valign="top">60 (44.3&#x2010;74.3)</td><td align="left" valign="top">60 (44.3&#x2010;74.3)</td><td align="left" valign="top">62.2 (46.5&#x2010;76.2)</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>Our study investigated whether prompting strategies inspired by NDM&#x2014;a field that analyzes how humans make real-world decisions under uncertainty&#x2014;can improve LLM performance in ill-defined tasks such as care-seeking decisions. Our results show that both the recognition-primed and the data-frame prompts increased the accuracy of care-seeking advice across all tested models except GPT-5.2. Although this effect may partly reflect the additional reasoning process before producing an answer among nonreasoning models, we observed improvements not only in nonreasoning models but also in reasoning models that already include a reasoning process. This observation suggests that our results cannot simply be attributed to a general reasoning process. Notably, most nonreasoning models with NDM-inspired prompts outperformed traditional reasoning models using the default prompt, and reasoning models also showed significant improvements with the NDM-inspired prompts.</p><p>The greatest improvements due to the NDM-inspired prompts were seen in self-care cases, which were more often correctly identified. Nonreasoning models rarely or never provided self-care advice with the default prompt, a finding consistent with previous studies [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref69">69</xref>,<xref ref-type="bibr" rid="ref70">70</xref>]. When prompted with the NDM-inspired prompts, these models began giving self-care advice and even reached a relatively high level of accuracy, up to 44%. 
In contrast, accuracy for the 2 included emergency cases and the included nonemergency cases showed little change, likely due to a ceiling effect, as the tested models were already highly accurate on these cases with the default prompt. Prior research suggests that self-care advice is typically given by LLMs only when a reasoning process is included and that the tendency toward risk-averse recommendations may stem from built-in safety measures [<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]. The recognition-primed prompt explicitly instructs the model to recall similar situations (pattern matching according to the RPD model) and to forecast possible outcomes (mental simulation), which may help the model reconsider overly cautious recommendations before giving advice. Similarly, the data-frame prompt encourages the model to re-examine each initial recommendation and&#x2014;if new data do not fit the initial frame&#x2014;explore alternative frames, which may help identify when self-care is sufficient rather than defaulting to medical referral.</p><p>OpenAI&#x2019;s most recent GPT-5 model family includes an updated reasoning process [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The benefits of using NDM-inspired prompts were replicated for GPT-5.1 (for both the Instant version without a reasoning process and the Thinking version with a reasoning process), but not for GPT-5.2: self-care accuracy dropped to 0% in both GPT-5.2 Instant and GPT-5.2 Thinking and remained unchanged with NDM-inspired prompts. These results may suggest a version-level shift toward recommending professional care that prompting does not alter. 
This observation is unlikely to be attributable to changes in the reasoning mechanism alone, because GPT-5.1&#x2014;despite also using the updated reasoning process&#x2014;did not show the same decrease in self-care accuracy.</p></sec><sec id="s4-2"><title>Implications</title><p>The present findings have implications for prompt engineering, artificial intelligence (AI) research, and end users. First, for prompt engineering, we suggest that, rather than relying solely on prompts built on computer science (eg, ensemble methods and decomposition), strategies derived from cognitive science, applied psychology, and HF/E&#x2014;especially those based on models of human decision-making under uncertainty&#x2014;may be more effective or, at least, serious competitors, particularly in domains with high ambiguity and uncertainty, such as triage or diagnostic decisions. In these ill-defined situations, we showed that a &#x201C;reasoning blueprint&#x201D; based on human cognition can outperform methods that simply instruct the models to reason. We acknowledge, however, that, based on our results, the benefits of NDM-inspired prompts are thus far limited to uncertain tasks. It remains to be seen how they perform on more well-defined tasks, such as text formatting or summarization.</p></sec><sec id="s4-3"><title>Limitations</title><p>Although our results show a positive impact of combining NDM with prompt engineering, there are several limitations. First, we conducted a single benchmarking test within one domain, that is, care-seeking advice. Although this is a typical real-world decision task with high uncertainty and a common use case for LLMs [<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref76">76</xref>-<xref ref-type="bibr" rid="ref80">80</xref>], it remains unclear whether our findings generalize to other tasks or domains with varying levels of ambiguity and/or uncertainty. 
Second, the low sample size for emergency cases leads to unstable accuracy estimates, and no safety conclusions should be derived from the data presented here. Third, we limited our evaluation to LLMs that are currently integrated into ChatGPT. We made this decision to assess practical impact for users; however, it is unclear whether these results would hold true for the broader range of LLMs available or in development. In particular, future research should test whether similar results can be achieved with smaller models and limited context windows, given that the reasoning process increases token requirements.</p><p>The NDM-inspired prompts themselves present another limitation. These prompts are computationally more expensive to run than standard user inputs because they add a reasoning output before giving advice. Although this may not affect individual users, it could increase operational costs for developers integrating such prompts, especially compared to nonreasoning models. We recommend that any potential performance gains from NDM-inspired prompting be carefully weighed against increased costs on a case-by-case basis.</p><p>Next, we did not include participants who interacted with the LLMs directly. Instead, we used a highly controlled setup in which each model was prompted repeatedly using standardized prompts. In real-world use, however, users&#x2019; prompts vary substantially in both content and quality [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref93">93</xref>]. Accordingly, the present study was designed to test whether NDM-inspired prompts can improve model accuracy under controlled conditions; we cannot infer that these prompts would translate into improved user decisions or higher-quality outputs in everyday use. This work should, therefore, be interpreted as a technical evaluation of model behavior under controlled inputs rather than as a clinical validation study. 
Depending on the intended use, LLMs may be regulated as Software as a Medical Device and may, therefore, require additional evidence that is outside the scope of the present study. Recent work on differing user inputs and adversarial attacks in chatbots shows that they can produce unsafe outputs depending on the specific prompts, which further demonstrates that more rigorous and use-case-specific safety evaluations are needed before deployment [<xref ref-type="bibr" rid="ref94">94</xref>,<xref ref-type="bibr" rid="ref95">95</xref>]. Future studies should therefore conduct user studies to examine whether NDM-inspired prompts also yield better recommendations and decision support for users in real-world settings, and to determine how NDM-inspired prompts may be used to prevent adversarial attacks.</p><p>Finally, the prompts tested here were based on only 2 decision-making models. There are other models that could serve as inspiration for prompt development, such as the decision ladder or heuristic decision models [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref96">96</xref>]. Moreover, domain-specific decision-making models may be even better suited for certain use cases. For care-seeking advice, no such model currently exists to explain how humans make these decisions. However, the development of such a model could be helpful to develop even more targeted prompting strategies to further increase LLM performance.</p></sec><sec id="s4-4"><title>Future Research</title><p>This study is among the first to combine NDM and AI-based decision support systems to foster more naturalistic decision support. Our findings provide a foundation for future work by demonstrating that real-world human reasoning strategies can improve the accuracy of LLMs. Building on these results, future work could examine how NDM and AI can be combined to support users. 
For example, prompting LLMs to use reasoning processes that reflect human decision-making could open a new direction for explainable AI. Unlike traditional explainable AI methods that focus on feature importance, providing explanations based on human-like pattern recognition and mental simulation may increase trust and help users identify potential mistakes in the reasoning process. Prior research has shown that users critically assess, rather than blindly follow, AI advice [<xref ref-type="bibr" rid="ref4">4</xref>]. Giving users an NDM-inspired reasoning approach may support this evaluation more than providing advice with a post hoc explanation.</p><p>NDM-inspired prompts may also improve human-AI collaboration: When humans and AI share a conceptual language (consisting of frames, pattern matching, and mental simulation), it may become easier for users to integrate AI advice into their own reasoning. For example, physicians could review the frames used by the LLM, add new data points, and let the AI simulate whether these fit the frame. Conversely, the AI could make predictions based on its frame, which the physician can cross-check with clinical data. An AI would thus not only give a final recommendation but also provide support in hypothesis generation, data gathering, and hypothesis testing [<xref ref-type="bibr" rid="ref97">97</xref>,<xref ref-type="bibr" rid="ref98">98</xref>].</p><p>Next, NDM-inspired prompting could also be used for education and training. LLMs could serve as interactive tools for medical students, allowing them to practice decision-making using the RPD model alongside the AI by comparing their mental simulations with those of the model. 
The AI could then provide feedback on differences in their respective frames.</p><p>More broadly, future work should move beyond technical benchmarking toward evaluation designs aligned with Software as a Medical Device expectations by predefining the intended use case and testing performance and safety prospectively in real-world settings. In this context, NDM may be treated as a theoretical basis for uncertainty management, and future studies can test whether NDM-based prompts reduce failures across different user inputs and adversarial attacks.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In this study, we showed that applying models from NDM to prompt LLMs can improve performance in highly uncertain and ambiguous care-seeking tasks. Both NDM-inspired prompts tested here increased overall accuracy across both reasoning and nonreasoning models, with the greatest improvement in self-care recommendations, while maintaining high accuracy in the 2 included emergency cases and all included nonemergency cases. These findings may open up a new strategy for prompt engineering: rather than relying on prompts derived from computer science, prompts that build on NDM models or related models from applied psychology and HF/E, which represent how humans make sense of uncertainty, may be more effective in ill-defined tasks. As LLMs and other AI tools are increasingly adopted in safety-critical and everyday applications, NDM-inspired prompting may offer a strategy for making AI more useful for real-world decision-making.</p></sec></sec></body><back><ack><p>The authors declare the use of generative artificial intelligence (GAI) in the research and writing process. According to the GAIDeT (Generative AI Delegation Taxonomy; 2025), the following tasks were delegated to GAI tools under full human supervision: proofreading and editing. The GAI tool used was ChatGPT, model GPT-4.1. Responsibility for the final manuscript lies entirely with the authors. 
GAI tools are not listed as authors and do not bear responsibility for the final outcomes.</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>The data can be accessed via Zenodo [<xref ref-type="bibr" rid="ref99">99</xref>].</p></sec></notes><fn-group><fn fn-type="conflict"><p>MK is an associate editor for <italic>JMIR Public Health and Surveillance</italic>.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">HF/E</term><def><p>human factors and ergonomics</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">NDM</term><def><p>naturalistic decision-making</p></def></def-item><def-item><term id="abb5">RPD</term><def><p>recognition-primed decision-making</p></def></def-item><def-item><term id="abb6">TRIPOD</term><def><p>Transparent Reporting of a Multivariable Model for Individual Prognosis or Diagnosis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jo</surname><given-names>E</given-names> </name><name name-style="western"><surname>Song</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Assessing GPT-4&#x2019;s performance in delivering medical advice: comparative analysis with human experts</article-title><source>JMIR Med Educ</source><year>2024</year><month>07</month><day>8</day><volume>10</volume><fpage>e51282</fpage><pub-id pub-id-type="doi">10.2196/51282</pub-id><pub-id pub-id-type="medline">38989848</pub-id></nlm-citation></ref><ref 
id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kisvarday</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yarahuan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT use among pediatric health care providers: cross-sectional survey study</article-title><source>JMIR Form Res</source><year>2024</year><month>09</month><day>12</day><volume>8</volume><fpage>e56797</fpage><pub-id pub-id-type="doi">10.2196/56797</pub-id><pub-id pub-id-type="medline">39265163</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kunz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmid</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Technology-supported self-triage decision making</article-title><source>NPJ Health Syst</source><year>2025</year><month>01</month><day>25</day><volume>2</volume><issue>1</issue><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1038/s44401-024-00008-x</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>von Kalckreuth</surname><given-names>N</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Accuracy of online symptom assessment applications, large language models, and laypeople for self-triage decisions</article-title><source>NPJ Digit 
Med</source><year>2025</year><month>03</month><day>25</day><volume>8</volume><issue>1</issue><fpage>178</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01566-6</pub-id><pub-id pub-id-type="medline">40133390</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Memarian</surname><given-names>B</given-names> </name><name name-style="western"><surname>Doleck</surname><given-names>T</given-names> </name></person-group><article-title>ChatGPT in education: methods, potentials, and limitations</article-title><source>Comput Hum Behav Artif Humans</source><year>2023</year><month>08</month><volume>1</volume><issue>2</issue><fpage>100022</fpage><pub-id pub-id-type="doi">10.1016/j.chbah.2023.100022</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ajevski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Barker</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hardie</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>F</given-names> </name></person-group><article-title>ChatGPT and the future of legal education and practice</article-title><source>Law Teach</source><year>2023</year><month>07</month><day>3</day><volume>57</volume><issue>3</issue><fpage>352</fpage><lpage>364</lpage><pub-id pub-id-type="doi">10.1080/03069400.2023.2207426</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Henrickson</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Mero&#x00F1;o-Pe&#x00F1;uela</surname><given-names>A</given-names> </name></person-group><article-title>Prompting meaning: a hermeneutic approach to optimising prompt engineering with ChatGPT</article-title><source>AI Soc</source><year>2025</year><month>02</month><volume>40</volume><issue>2</issue><fpage>903</fpage><lpage>918</lpage><pub-id pub-id-type="doi">10.1007/s00146-023-01752-8</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Housley</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dahl</surname><given-names>P</given-names> </name></person-group><article-title>Membership categorisation, sociological description and role prompt engineering with ChatGPT</article-title><source>Discourse Commun</source><year>2024</year><month>12</month><volume>18</volume><issue>6</issue><fpage>848</fpage><lpage>858</lpage><pub-id pub-id-type="doi">10.1177/17504813241267068</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><issue>1</issue><fpage>e50638</fpage><pub-id pub-id-type="doi">10.2196/50638</pub-id><pub-id pub-id-type="medline">37792434</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Reynolds</surname><given-names>L</given-names> </name><name name-style="western"><surname>McDonell</surname><given-names>K</given-names> 
</name></person-group><article-title>Prompt programming for large language models: beyond the few-shot paradigm</article-title><conf-name>Extended Abstracts of the 2021 CHI Conference on Human Factors in Computing Systems (CHI EA &#x2019;21)</conf-name><conf-date>May 8-13, 2021</conf-date><pub-id pub-id-type="doi">10.1145/3411763.3451760</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schulhoff</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ilie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Balepur</surname><given-names>N</given-names> </name><etal/></person-group><article-title>The prompt report: a systematic survey of prompt engineering techniques</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.06608</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Si</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Boyd-Graber</surname><given-names>J</given-names> </name></person-group><article-title>Getting more out of mixture of language model reasoning experts</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2023</conf-name><conf-date>Dec 6-10, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.552</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Sorensen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rytting</surname><given-names>C</given-names> </name><etal/></person-group><article-title>An information-theoretic approach to prompt engineering without ground truth labels</article-title><conf-name>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>May 22-27, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.60</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Self-consistency improves chain of thought reasoning in language models</article-title><access-date>2026-03-18</access-date><conf-name>Proceedings of the 11th International Conference on Learning Representations (ICLR 2023)</conf-name><conf-date>May 1-5, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=1PL1NIMMrw">https://openreview.net/pdf?id=1PL1NIMMrw</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Madaan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tandon</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>P</given-names> </name></person-group><article-title>SELF-REFINE: iterative refinement with 
SELF-feedback</article-title><access-date>2026-03-18</access-date><conf-name>Proceedings of the 37th International Conference on Neural Information Processing Systems (NeurIPS 2023)</conf-name><conf-date>Dec 10-16, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=S37hOerQLB">https://openreview.net/pdf?id=S37hOerQLB</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Weng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Large language models are better reasoners with self-verification</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2023</conf-name><conf-date>Dec 6-10, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.167</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Baral</surname><given-names>C</given-names> </name></person-group><article-title>Is a question decomposition unit all we need?</article-title><conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 7-11, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.302</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lan</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Plan-and-solve prompting: improving zero-shot chain-of-thought reasoning by large language models</article-title><conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Jul 9-14, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.147</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><access-date>2026-03-18</access-date><conf-name>NIPS&#x2019;22: Proceedings of the 36th International Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3600270.3602070">https://dl.acm.org/doi/10.5555/3600270.3602070</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Besta</surname><given-names>M</given-names> </name><name name-style="western"><surname>Blach</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kubicek</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Graph of 
thoughts: solving elaborate problems with large language models</article-title><conf-name>Proceedings of the Thirty-Eighth AAAI Conference on Artificial Intelligence and Thirty-Sixth Conference on Innovative Applications of Artificial Intelligence and Fourteenth Symposium on Educational Advances in Artificial Intelligence</conf-name><conf-date>Feb 27 to Mar 3, 2024</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v38i16.29720</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Reasoning models</article-title><source>OpenAI Developers</source><access-date>2025-06-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developers.openai.com/api/docs/guides/reasoning">https://developers.openai.com/api/docs/guides/reasoning</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Iwasawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kojima</surname><given-names>T</given-names> </name><name name-style="western"><surname>Matsuo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Reid</surname><given-names>M</given-names> </name></person-group><article-title>Large language models are zero-shot reasoners</article-title><conf-name>NIPS&#x2019;22: Proceedings of the 36th International Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><pub-id pub-id-type="doi">10.52202/068431-1613</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>HA</given-names> </name><name 
name-style="western"><surname>Househ</surname><given-names>M</given-names> </name></person-group><article-title>Chain of thought strategy for smaller LLMs for medical reasoning</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>05</month><day>15</day><volume>327</volume><fpage>783</fpage><lpage>787</lpage><pub-id pub-id-type="doi">10.3233/SHTI250466</pub-id><pub-id pub-id-type="medline">40380574</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Leon</surname><given-names>M</given-names> </name></person-group><article-title>GPT-5 and open-weight large language models: advances in reasoning, transparency, and control</article-title><source>Inf Syst</source><year>2026</year><month>02</month><volume>136</volume><fpage>102620</fpage><pub-id pub-id-type="doi">10.1016/j.is.2025.102620</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Safari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Capabilities of GPT-5 on multimodal medical reasoning</article-title><source>arXiv</source><comment>Preprint posted online on Aug 11, 2025</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2508.08224</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flach</surname><given-names>JM</given-names> </name><name 
name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Reynolds</surname><given-names>PL</given-names> </name><name name-style="western"><surname>Parker</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Kellogg</surname><given-names>KM</given-names> </name></person-group><article-title>Decisionmaking in practice: the dynamics of muddling through</article-title><source>Appl Ergon</source><year>2017</year><month>09</month><volume>63</volume><fpage>133</fpage><lpage>141</lpage><pub-id pub-id-type="doi">10.1016/j.apergo.2017.03.017</pub-id><pub-id pub-id-type="medline">28502402</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shojaee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mirzadeh</surname><given-names>I</given-names> </name><name name-style="western"><surname>Alizadeh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Horton</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><name name-style="western"><surname>Farajtabar</surname><given-names>M</given-names> </name></person-group><article-title>The illusion of thinking</article-title><source>SI</source><year>2025</year><volume>2</volume><issue>6</issue><comment><ext-link ext-link-type="uri" xlink:href="https://s-rsa.com/index.php/agi/issue/view/1413">https://s-rsa.com/index.php/agi/issue/view/1413</ext-link></comment><pub-id pub-id-type="doi">10.70777/si.v2i6.15919</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lawsen</surname><given-names>A</given-names> </name></person-group><article-title>Comment on the illusion of 
thinking: understanding the strengths and limitations of reasoning models via the lens of problem complexity</article-title><source>arXiv</source><comment>Preprint posted online on Jun 10, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.09250</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kahneman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>G</given-names> </name></person-group><article-title>Conditions for intuitive expertise: a failure to disagree</article-title><source>Am Psychol</source><year>2009</year><month>09</month><volume>64</volume><issue>6</issue><fpage>515</fpage><lpage>526</lpage><pub-id pub-id-type="doi">10.1037/a0016755</pub-id><pub-id pub-id-type="medline">19739881</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Nordli</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Todd</surname><given-names>PM</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Viale</surname><given-names>R</given-names> </name></person-group><article-title>Ecological rationality: bounded rationality in an evolutionary light</article-title><source>Routledge Handbook of Bounded Rationality</source><year>2020</year><edition>1</edition><publisher-name>Routledge</publisher-name><fpage>280</fpage><lpage>312</lpage><pub-id pub-id-type="other">9781315658353</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dhami</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Hertwig</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Hoffrage</surname><given-names>U</given-names> </name></person-group><article-title>The role of representative design in an ecological approach to cognition</article-title><source>Psychol Bull</source><year>2004</year><month>11</month><volume>130</volume><issue>6</issue><fpage>959</fpage><lpage>988</lpage><pub-id pub-id-type="doi">10.1037/0033-2909.130.6.959</pub-id><pub-id pub-id-type="medline">15535744</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klein</surname><given-names>G</given-names> </name></person-group><article-title>Naturalistic decision making</article-title><source>Hum Factors</source><year>2008</year><month>06</month><volume>50</volume><issue>3</issue><fpage>456</fpage><lpage>460</lpage><pub-id pub-id-type="doi">10.1518/001872008X288385</pub-id><pub-id pub-id-type="medline">18689053</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> 
</name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLoS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id 
pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eriksen</surname><given-names>AV</given-names> </name><name name-style="western"><surname>M&#x00F6;ller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ryg</surname><given-names>J</given-names> </name></person-group><article-title>Use of GPT-4 to diagnose complex clinical cases</article-title><source>NEJM AI</source><year>2024</year><month>01</month><volume>1</volume><issue>1</issue><fpage>AIp2300031</fpage><pub-id pub-id-type="doi">10.1056/AIp2300031</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Gudera</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Wiegand</surname><given-names>TLT</given-names> </name><name name-style="western"><surname>Allmendinger</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dimitriadis</surname><given-names>K</given-names> </name><name name-style="western"><surname>Koerte</surname><given-names>IK</given-names> </name></person-group><article-title>ChatGPT passes German state examination in medicine with picture questions omitted</article-title><source>Dtsch Arztebl Int</source><year>2023</year><month>05</month><day>30</day><volume>120</volume><issue>21</issue><fpage>373</fpage><lpage>374</lpage><pub-id pub-id-type="doi">10.3238/arztebl.m2023.0113</pub-id><pub-id pub-id-type="medline">37530052</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Hickman</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Monahan</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Schwarcz</surname><given-names>D</given-names> </name></person-group><article-title>ChatGPT goes to law school</article-title><source>J Legal Educ</source><year>2021</year><volume>71</volume><fpage>387</fpage><lpage>400</lpage><pub-id pub-id-type="doi">10.2139/ssrn.4335905</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Freitas</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Gomes</surname><given-names>LM</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Moniz</surname><given-names>N</given-names> </name><name name-style="western"><surname>Vale</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Cascalho</surname><given-names>J</given-names> </name><name name-style="western"><surname>Silva</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sebasti&#x00E3;o</surname><given-names>R</given-names> </name></person-group><article-title>Does ChatGPT pass the Brazilian bar exam?</article-title><source>Progress in Artificial Intelligence: 22nd EPIA Conference on Artificial Intelligence, EPIA 2023, Faial Island, Azores, September 5&#x2013;8, 2023, Proceedings, Part II</source><year>2023</year><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-3-031-49011-8_11</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Winter</surname><given-names>JCF</given-names> </name></person-group><article-title>Can ChatGPT pass high school exams on English language comprehension?</article-title><source>Int J Artif Intell Educ</source><year>2024</year><month>09</month><volume>34</volume><issue>3</issue><fpage>915</fpage><lpage>930</lpage><pub-id pub-id-type="doi">10.1007/s40593-023-00372-z</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewandowski</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0141;ukowicz</surname><given-names>P</given-names> </name><name name-style="western"><surname>&#x015A;wietlik</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bara&#x0144;ska-Rybak</surname><given-names>W</given-names> </name></person-group><article-title>ChatGPT-3.5 and ChatGPT-4 dermatological knowledge level based on the Specialty Certificate Examination in Dermatology</article-title><source>Clin Exp 
Dermatol</source><year>2024</year><month>06</month><day>25</day><volume>49</volume><issue>7</issue><fpage>686</fpage><lpage>691</lpage><pub-id pub-id-type="doi">10.1093/ced/llad255</pub-id><pub-id pub-id-type="medline">37540015</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Large language models are less effective at clinical prediction tasks than locally trained machine learning models</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>05</month><day>1</day><volume>32</volume><issue>5</issue><fpage>811</fpage><lpage>822</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf038</pub-id><pub-id pub-id-type="medline">40056436</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>HA</given-names> </name></person-group><article-title>A behavioral model of rational choice</article-title><source>Q J Econ</source><year>1955</year><month>02</month><volume>69</volume><issue>1</issue><fpage>99</fpage><pub-id pub-id-type="doi">10.2307/1884852</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hertwig</surname><given-names>R</given-names> </name><name name-style="western"><surname>Leuker</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pachur</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Spiliopoulos</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pleskac</surname><given-names>TJ</given-names> </name></person-group><article-title>Studies in ecological rationality</article-title><source>Top Cogn Sci</source><year>2022</year><month>07</month><volume>14</volume><issue>3</issue><fpage>467</fpage><lpage>491</lpage><pub-id pub-id-type="doi">10.1111/tops.12567</pub-id><pub-id pub-id-type="medline">34310848</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klein</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jarosz</surname><given-names>A</given-names> </name></person-group><article-title>A naturalistic study of insight</article-title><source>J Cogn Eng Decis Mak</source><year>2011</year><month>12</month><volume>5</volume><issue>4</issue><fpage>335</fpage><lpage>351</lpage><pub-id pub-id-type="doi">10.1177/1555343411427013</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gigerenzer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Brighton</surname><given-names>H</given-names> </name></person-group><article-title>Homo heuristicus: why biased minds make better inferences</article-title><source>Top Cogn Sci</source><year>2009</year><month>01</month><volume>1</volume><issue>1</issue><fpage>107</fpage><lpage>143</lpage><pub-id pub-id-type="doi">10.1111/j.1756-8765.2008.01006.x</pub-id><pub-id pub-id-type="medline">25164802</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gigerenzer</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Gaissmaier</surname><given-names>W</given-names> </name></person-group><article-title>Heuristic decision making</article-title><source>Annu Rev Psychol</source><year>2011</year><volume>62</volume><fpage>451</fpage><lpage>482</lpage><pub-id pub-id-type="doi">10.1146/annurev-psych-120709-145346</pub-id><pub-id pub-id-type="medline">21126183</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Luan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gigerenzer</surname><given-names>G</given-names> </name></person-group><article-title>Modeling fast-and-frugal heuristics</article-title><source>Psych J</source><year>2022</year><month>08</month><volume>11</volume><issue>4</issue><fpage>600</fpage><lpage>611</lpage><pub-id pub-id-type="doi">10.1002/pchj.576</pub-id><pub-id pub-id-type="medline">35778774</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Artinger</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Gigerenzer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jacobs</surname><given-names>P</given-names> </name></person-group><article-title>Satisficing: integrating two traditions</article-title><source>J Econ Lit</source><year>2022</year><month>06</month><volume>60</volume><issue>2</issue><fpage>598</fpage><lpage>635</lpage><pub-id pub-id-type="doi">10.1257/jel.20201396</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grove</surname><given-names>WM</given-names> </name><name 
name-style="western"><surname>Zald</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Lebow</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Snitz</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Nelson</surname><given-names>C</given-names> </name></person-group><article-title>Clinical versus mechanical prediction: a meta-analysis</article-title><source>Psychol Assess</source><year>2000</year><month>03</month><volume>12</volume><issue>1</issue><fpage>19</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1037/1040-3590.12.1.19</pub-id><pub-id pub-id-type="medline">10752360</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karelaia</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hogarth</surname><given-names>RM</given-names> </name></person-group><article-title>Determinants of linear judgment: a meta-analysis of lens model studies</article-title><source>Psychol Bull</source><year>2008</year><month>05</month><volume>134</volume><issue>3</issue><fpage>404</fpage><lpage>426</lpage><pub-id pub-id-type="doi">10.1037/0033-2909.134.3.404</pub-id><pub-id pub-id-type="medline">18444703</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Klein</surname><given-names>GA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Klein</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Orasanu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Calderwood</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zsambok</surname><given-names>CE</given-names> 
</name></person-group><article-title>A recognition-primed decision (RPD) model of rapid decision making</article-title><source>Decision Making in Action: Models and Methods</source><year>1993</year><access-date>2026-03-18</access-date><publisher-name>Ablex Publishing</publisher-name><fpage>138</fpage><lpage>147</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://psycnet.apa.org/record/1993-97634-006">https://psycnet.apa.org/record/1993-97634-006</ext-link></comment><pub-id pub-id-type="other">978-0-89391-794-4</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Klein</surname><given-names>G</given-names> </name><name name-style="western"><surname>Phillips</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Rall</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Peluso</surname><given-names>DA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Hoffman</surname><given-names>RR</given-names> </name></person-group><article-title>A data-frame theory of sensemaking</article-title><source>Expertise Out of Context: Proceedings of the Sixth International Conference on Naturalistic Decision Making</source><year>2007</year><publisher-name>Lawrence Erlbaum Associates Publishers</publisher-name><fpage>113</fpage><lpage>155</lpage><pub-id pub-id-type="doi">10.4324/9780203810088</pub-id><pub-id pub-id-type="other">978-0-8058-5510-4</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reale</surname><given-names>C</given-names> </name><name name-style="western"><surname>Salwei</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Militello</surname><given-names>LG</given-names> 
</name><etal/></person-group><article-title>Decision&#x2011;making during high&#x2011;risk events: a systematic literature review</article-title><source>J Cogn Eng Decis Mak</source><year>2023</year><month>06</month><volume>17</volume><issue>2</issue><fpage>188</fpage><lpage>212</lpage><pub-id pub-id-type="doi">10.1177/15553434221147415</pub-id><pub-id pub-id-type="medline">37823061</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ahuna</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Becker</surname><given-names>KD</given-names> </name></person-group><article-title>Scoping review of naturalistic decision making studies among mental health professionals: coverage of characteristics and contexts</article-title><source>J Cogn Eng Decis Mak</source><year>2025</year><month>03</month><volume>19</volume><issue>1</issue><fpage>96</fpage><lpage>129</lpage><pub-id pub-id-type="doi">10.1177/15553434241303806</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>How to evaluate the accuracy of symptom checkers and diagnostic decision support systems: Symptom Checker Accuracy Reporting Framework (SCARF)</article-title><source>JMIR Hum Factors</source><year>2026</year><month>01</month><day>16</day><volume>13</volume><fpage>e76168</fpage><pub-id pub-id-type="doi">10.2196/76168</pub-id><pub-id pub-id-type="medline">41544248</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Semigran</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Linder</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Gidengil</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mehrotra</surname><given-names>A</given-names> </name></person-group><article-title>Evaluation of symptom checkers for self diagnosis and triage: audit study</article-title><source>BMJ</source><year>2015</year><month>07</month><day>8</day><volume>351</volume><fpage>h3480</fpage><pub-id pub-id-type="doi">10.1136/bmj.h3480</pub-id><pub-id pub-id-type="medline">26157077</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hill</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Sim</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mills</surname><given-names>B</given-names> </name></person-group><article-title>The quality of diagnosis and triage advice provided by free online symptom checkers and apps in Australia</article-title><source>Med J Aust</source><year>2020</year><month>06</month><volume>212</volume><issue>11</issue><fpage>514</fpage><lpage>519</lpage><pub-id pub-id-type="doi">10.5694/mja2.50600</pub-id><pub-id pub-id-type="medline">32391611</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>S&#x00E6;tre</surname><given-names>LMS</given-names> </name><name name-style="western"><surname>Raasth&#x00F8;j</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lauridsen</surname><given-names>GB</given-names> </name><etal/></person-group><article-title>Revisiting the symptom iceberg based on the Danish symptom cohort - symptom experiences 
and healthcare-seeking behaviour in the general Danish population in 2022</article-title><source>Heliyon</source><year>2024</year><month>05</month><day>30</day><volume>10</volume><issue>10</issue><fpage>e31090</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e31090</pub-id><pub-id pub-id-type="medline">38803940</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McAteer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Elliott</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Hannaford</surname><given-names>PC</given-names> </name></person-group><article-title>Ascertaining the size of the symptom iceberg in a UK-wide community-based survey</article-title><source>Br J Gen Pract</source><year>2011</year><month>01</month><volume>61</volume><issue>582</issue><fpage>e1</fpage><lpage>e11</lpage><pub-id pub-id-type="doi">10.3399/bjgp11X548910</pub-id><pub-id pub-id-type="medline">21401979</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Scatturin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Napierala</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Characteristics of users and nonusers of symptom checkers in Germany: cross-sectional survey study</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>20</day><volume>25</volume><fpage>e46231</fpage><pub-id pub-id-type="doi">10.2196/46231</pub-id><pub-id pub-id-type="medline">37338970</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>HL</given-names> </name></person-group><article-title>Evaluation of artificial intelligence for patient self-triage: comparison of general-purpose AI platforms with the NHS 111 online symptom checker in the United Kingdom</article-title><source>Cureus</source><year>2025</year><month>11</month><volume>17</volume><issue>11</issue><fpage>e97834</fpage><pub-id pub-id-type="doi">10.7759/cureus.97834</pub-id><pub-id pub-id-type="medline">41458776</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Evaluating the accuracy of ChatGPT model versions for giving care-seeking advice</article-title><source>Commun Med (Lond)</source><year>2026</year><month>02</month><day>25</day><volume>6</volume><issue>1</issue><fpage>171</fpage><pub-id pub-id-type="doi">10.1038/s43856-026-01466-0</pub-id><pub-id pub-id-type="medline">41735505</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraser</surname><given-names>H</given-names> </name><name name-style="western"><surname>Crossland</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bacher</surname><given-names>I</given-names> </name><name name-style="western"><surname>Ranney</surname><given-names>M</given-names> </name><name name-style="western"><surname>Madsen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hilliard</surname><given-names>R</given-names> 
</name></person-group><article-title>Comparison of diagnostic and triage accuracy of Ada Health and WebMD symptom checkers, ChatGPT, and physicians for patients in an emergency department: clinical data analysis study</article-title><source>JMIR mHealth uHealth</source><year>2023</year><month>10</month><day>3</day><volume>11</volume><fpage>e49995</fpage><pub-id pub-id-type="doi">10.2196/49995</pub-id><pub-id pub-id-type="medline">37788063</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levine</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Tuwani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kompa</surname><given-names>B</given-names> </name><etal/></person-group><article-title>The diagnostic and triage accuracy of the GPT-3 artificial intelligence model: an observational study</article-title><source>Lancet Digit Health</source><year>2024</year><month>08</month><volume>6</volume><issue>8</issue><fpage>e555</fpage><lpage>e561</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00097-9</pub-id><pub-id pub-id-type="medline">39059888</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Napierala</surname><given-names>H</given-names> </name><name name-style="western"><surname>Privoznik</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sapunova</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>The RepVig framework for designing use-case 
specific representative vignettes and evaluating triage accuracy of laypeople and symptom assessment applications</article-title><source>Sci Rep</source><year>2024</year><month>12</month><day>23</day><volume>14</volume><issue>1</issue><fpage>30614</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-83844-z</pub-id><pub-id pub-id-type="medline">39715767</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Fletcher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Nielsen</surname><given-names>RK</given-names> </name></person-group><article-title>What does the public in six countries think of generative AI in news?</article-title><year>2024</year><access-date>2026-03-18</access-date><publisher-name>Reuters Institute for the Study of Journalism, University of Oxford</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://reutersinstitute.politics.ox.ac.uk/sites/default/files/2024-05/Fletcher_and_Nielsen_Generative_AI_and_News_Audiences.pdf">https://reutersinstitute.politics.ox.ac.uk/sites/default/files/2024-05/Fletcher_and_Nielsen_Generative_AI_and_News_Audiences.pdf</ext-link></comment><pub-id pub-id-type="doi">10.60625/RISJ-4ZB8-CG87</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmieding</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>K</given-names> </name><name name-style="western"><surname>Schulz-Niethammer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Balzer</surname><given-names>F</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Triage accuracy of symptom checker apps: 5-year follow-up evaluation</article-title><source>J Med Internet Res</source><year>2022</year><month>05</month><day>10</day><volume>24</volume><issue>5</issue><fpage>e31810</fpage><pub-id pub-id-type="doi">10.2196/31810</pub-id><pub-id pub-id-type="medline">35536633</pub-id></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ceney</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tolond</surname><given-names>S</given-names> </name><name name-style="western"><surname>Glowinski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marks</surname><given-names>B</given-names> </name><name name-style="western"><surname>Swift</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Palser</surname><given-names>T</given-names> </name></person-group><article-title>Accuracy of online symptom checkers and the potential impact on service utilisation</article-title><source>PLoS One</source><year>2021</year><volume>16</volume><issue>7</issue><fpage>e0254088</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0254088</pub-id><pub-id pub-id-type="medline">34265845</pub-id></nlm-citation></ref><ref id="ref75"><label>75</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kawamura</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>ChatGPT-generated differential diagnosis lists for complex case-derived clinical vignettes: diagnostic accuracy evaluation</article-title><source>JMIR Med Inform</source><year>2023</year><month>10</month><day>9</day><volume>11</volume><fpage>e48808</fpage><pub-id pub-id-type="doi">10.2196/48808</pub-id><pub-id pub-id-type="medline">37812468</pub-id></nlm-citation></ref><ref id="ref76"><label>76</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garg</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Urs</surname><given-names>VL</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Chaudhary</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Paliwal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kar</surname><given-names>SK</given-names> </name></person-group><article-title>Exploring the role of ChatGPT in patient care (diagnosis and treatment) and medical research: a 
systematic review</article-title><source>Health Promot Perspect</source><year>2023</year><volume>13</volume><issue>3</issue><fpage>183</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.34172/hpp.2023.22</pub-id><pub-id pub-id-type="medline">37808939</pub-id></nlm-citation></ref><ref id="ref77"><label>77</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hose</surname><given-names>BZ</given-names> </name><name name-style="western"><surname>Rounds</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Nandwani</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Use of ChatGPT for urinary symptom management among people with spinal cord injury or disease: qualitative study</article-title><source>JMIR Rehabil Assist Technol</source><year>2025</year><month>05</month><day>29</day><volume>12</volume><fpage>e70339</fpage><pub-id pub-id-type="doi">10.2196/70339</pub-id><pub-id pub-id-type="medline">40440564</pub-id></nlm-citation></ref><ref id="ref78"><label>78</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al Shboul</surname><given-names>MKI</given-names> </name><name name-style="western"><surname>Alwreikat</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alotaibi</surname><given-names>FA</given-names> </name></person-group><article-title>Investigating the use of ChatGPT as a novel method for seeking health information: a qualitative approach</article-title><source>Sci Technol Libr</source><year>2024</year><month>07</month><day>2</day><volume>43</volume><issue>3</issue><fpage>225</fpage><lpage>234</lpage><pub-id pub-id-type="doi">10.1080/0194262X.2023.2250835</pub-id></nlm-citation></ref><ref id="ref79"><label>79</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ayo-Ajibola</surname><given-names>O</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Riddell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kravitz</surname><given-names>RL</given-names> </name></person-group><article-title>Characterizing the adoption and experiences of users of artificial intelligence-generated health information in the United States: cross-sectional questionnaire study</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>14</day><volume>26</volume><fpage>e55138</fpage><pub-id pub-id-type="doi">10.2196/55138</pub-id><pub-id pub-id-type="medline">39141910</pub-id></nlm-citation></ref><ref id="ref80"><label>80</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref81"><label>81</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Statistical refinement of patient-centered case vignettes for digital health 
research</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1411924</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1411924</pub-id><pub-id pub-id-type="medline">39498100</pub-id></nlm-citation></ref><ref id="ref82"><label>82</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Berner</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Schmieding</surname><given-names>ML</given-names> </name></person-group><article-title>How suitable are clinical vignettes for the evaluation of symptom checker apps? A test theoretical perspective</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><fpage>20552076231194929</fpage><pub-id pub-id-type="doi">10.1177/20552076231194929</pub-id><pub-id pub-id-type="medline">37614591</pub-id></nlm-citation></ref><ref id="ref83"><label>83</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Painter</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hayhoe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Riboli-Sasco</surname><given-names>E</given-names> </name><name name-style="western"><surname>El-Osta</surname><given-names>A</given-names> </name></person-group><article-title>Online symptom checkers: recommendations for a vignette-based clinical evaluation standard</article-title><source>J Med Internet Res</source><year>2022</year><month>10</month><day>26</day><volume>24</volume><issue>10</issue><fpage>e37408</fpage><pub-id pub-id-type="doi">10.2196/37408</pub-id><pub-id pub-id-type="medline">36287594</pub-id></nlm-citation></ref><ref 
id="ref84"><label>84</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arellano Carmona</surname><given-names>K</given-names> </name><name name-style="western"><surname>Chittamuru</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kravitz</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Ramondt</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ram&#x00ED;rez</surname><given-names>AS</given-names> </name></person-group><article-title>Health information seeking from an intelligent web-based symptom checker: cross-sectional questionnaire study</article-title><source>J Med Internet Res</source><year>2022</year><month>08</month><day>19</day><volume>24</volume><issue>8</issue><fpage>e36322</fpage><pub-id pub-id-type="doi">10.2196/36322</pub-id><pub-id pub-id-type="medline">35984690</pub-id></nlm-citation></ref><ref id="ref85"><label>85</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ito</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kadomatsu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fujisawa</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The accuracy and potential racial and ethnic biases of GPT-4 in the diagnosis and triage of health conditions: evaluation study</article-title><source>JMIR Med Educ</source><year>2023</year><month>11</month><day>2</day><volume>9</volume><fpage>e47532</fpage><pub-id pub-id-type="doi">10.2196/47532</pub-id><pub-id pub-id-type="medline">37917120</pub-id></nlm-citation></ref><ref id="ref86"><label>86</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Epstein</surname><given-names>RH</given-names> </name><name 
name-style="western"><surname>Dexter</surname><given-names>F</given-names> </name></person-group><article-title>Variability in large language models' responses to medical licensing and certification examinations. Comment on "How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment"</article-title><source>JMIR Med Educ</source><year>2023</year><month>07</month><day>13</day><volume>9</volume><fpage>e48305</fpage><pub-id pub-id-type="doi">10.2196/48305</pub-id><pub-id pub-id-type="medline">37440293</pub-id></nlm-citation></ref><ref id="ref87"><label>87</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Franc</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hart</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hata</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hertelendy</surname><given-names>A</given-names> </name></person-group><article-title>Repeatability, reproducibility, and diagnostic accuracy of a commercial large language model (ChatGPT) to perform emergency department triage using the Canadian triage and acuity scale</article-title><source>CJEM</source><year>2024</year><month>01</month><volume>26</volume><issue>1</issue><fpage>40</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1007/s43678-023-00616-w</pub-id><pub-id pub-id-type="medline">38206515</pub-id></nlm-citation></ref><ref id="ref88"><label>88</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Software 
symptomcheckR: an R package for analyzing and visualizing symptom checker triage performance</article-title><source>BMC Digit Health</source><year>2024</year><month>07</month><day>22</day><volume>2</volume><issue>1</issue><fpage>43</fpage><pub-id pub-id-type="doi">10.1186/s44247-024-00096-7</pub-id></nlm-citation></ref><ref id="ref89"><label>89</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wickham</surname><given-names>H</given-names> </name><name name-style="western"><surname>Averick</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bryan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Welcome to the Tidyverse</article-title><source>J Open Source Softw</source><year>2019</year><month>11</month><day>21</day><volume>4</volume><issue>43</issue><fpage>1686</fpage><pub-id pub-id-type="doi">10.21105/joss.01686</pub-id></nlm-citation></ref><ref id="ref90"><label>90</label><nlm-citation citation-type="web"><article-title>Psych: procedures for psychological, psychometric, and personality research</article-title><source>The Comprehensive R Archive Network</source><access-date>2025-08-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=psych">https://CRAN.R-project.org/package=psych</ext-link></comment></nlm-citation></ref><ref id="ref91"><label>91</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bates</surname><given-names>D</given-names> </name><name name-style="western"><surname>M&#x00E4;chler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bolker</surname><given-names>B</given-names> </name><name name-style="western"><surname>Walker</surname><given-names>S</given-names> </name></person-group><article-title>Fitting linear mixed-effects models using lme4</article-title><source>J 
Stat Soft</source><year>2015</year><volume>67</volume><issue>1</issue><pub-id pub-id-type="doi">10.18637/jss.v067.i01</pub-id></nlm-citation></ref><ref id="ref92"><label>92</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>J&#x0119;drzejczak</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Skar&#x017C;y&#x0144;ski</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kochanek</surname><given-names>K</given-names> </name></person-group><article-title>Testing new versions of ChatGPT in terms of physiology and electrophysiology of hearing: improved accuracy but not consistency</article-title><source>medRxiv</source><comment>Preprint posted online on Oct 8, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.10.08.24315089</pub-id></nlm-citation></ref><ref id="ref93"><label>93</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flaharty</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hanchard</surname><given-names>SL</given-names> </name><etal/></person-group><article-title>Evaluating large language models on medical, lay-language, and self-reported descriptions of genetic conditions</article-title><source>Am J Hum Genet</source><year>2024</year><month>09</month><day>5</day><volume>111</volume><issue>9</issue><fpage>1819</fpage><lpage>1833</lpage><pub-id pub-id-type="doi">10.1016/j.ajhg.2024.07.011</pub-id><pub-id pub-id-type="medline">39146935</pub-id></nlm-citation></ref><ref id="ref94"><label>94</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Moore</surname><given-names>J</given-names> </name><name name-style="western"><surname>Grabb</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Agnew</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Expressing stigma and inappropriate responses prevents LLMs from safely replacing mental health providers</article-title><year>2025</year><month>06</month><day>23</day><conf-name>FAccT &#x2019;25</conf-name><conf-date>Jun 23-26, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3715275">https://dl.acm.org/doi/proceedings/10.1145/3715275</ext-link></comment><pub-id pub-id-type="doi">10.1145/3715275.3732039</pub-id></nlm-citation></ref><ref id="ref95"><label>95</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>JD</given-names> </name><etal/></person-group><article-title>Multi-model assurance analysis showing large language models are highly vulnerable to adversarial hallucination attacks during clinical decision support</article-title><source>Commun Med (Lond)</source><year>2025</year><month>08</month><day>2</day><volume>5</volume><issue>1</issue><fpage>330</fpage><pub-id pub-id-type="doi">10.1038/s43856-025-01021-3</pub-id><pub-id pub-id-type="medline">40753316</pub-id></nlm-citation></ref><ref id="ref96"><label>96</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Hollnagel</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pedersen</surname><given-names>OM</given-names> </name><name name-style="western"><surname>Rasmussen</surname><given-names>J</given-names> </name></person-group><article-title>Notes on human performance 
analysis</article-title><year>1981</year><access-date>2026-03-18</access-date><publisher-name>Ris&#x00F8; National Laboratory</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://backend.orbit.dtu.dk/ws/portalfiles/portal/88561089/ris_m_2285.pdf">https://backend.orbit.dtu.dk/ws/portalfiles/portal/88561089/ris_m_2285.pdf</ext-link></comment></nlm-citation></ref><ref id="ref97"><label>97</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Rethinking human-AI collaboration in complex medical decision making: a case study in sepsis diagnosis</article-title><conf-name>Proceedings of the CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>May 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.1145/3613904.3642343</pub-id></nlm-citation></ref><ref id="ref98"><label>98</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x00E4;mmer</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Ernst</surname><given-names>K</given-names> </name><name name-style="western"><surname>Grab</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Collaboration during the diagnostic decision&#x2010;making process: when does it help?</article-title><source>J Behav Decis Mak</source><year>2024</year><month>01</month><volume>37</volume><issue>1</issue><fpage>e2357</fpage><pub-id pub-id-type="doi">10.1002/bdm.2357</pub-id></nlm-citation></ref><ref id="ref99"><label>99</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name></person-group><article-title>Data set supplementing &#x201C;increasing large language model accuracy for care-seeking advice using prompts reflecting human reasoning strategies in the real world: validation study&#x201D;</article-title><source>Zenodo</source><year>2026</year><month>03</month><day>28</day><access-date>2026-04-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/19291719">https://zenodo.org/records/19291719</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional accuracy and sensitivity analyses.</p><media xlink:href="biomedeng_v11i1e88053_app1.docx" xlink:title="DOCX File, 30 KB"/></supplementary-material></app-group></back></article>