<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <title>Evaluation on Probably Aligned</title>
    <link>https://probablyaligned.ai/tags/evaluation/</link>
    <description>Recent content in Evaluation on Probably Aligned</description>
    <generator>Hugo</generator>
    <language>en-us</language>
    <lastBuildDate>Wed, 07 Jan 2026 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://probablyaligned.ai/tags/evaluation/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Why Passing a Safety Test Might Mean Nothing</title>
      <link>https://probablyaligned.ai/safety/formal-methods/detectability-of-testing/</link>
      <pubDate>Wed, 07 Jan 2026 00:00:00 +0000</pubDate>
      <guid>https://probablyaligned.ai/safety/formal-methods/detectability-of-testing/</guid>
      <description>Simulated environments are our best tool for probing alignment. But if a model can distinguish test from deployment — and it almost certainly can — then testing tells you what the model does when it knows it's being watched. Nothing more.</description>
    </item>
  </channel>
</rss>
