This commit is contained in:
d12306 2024-10-03 22:58:39 -05:00
parent a904ce40c9
commit 784b5d10ed
18 changed files with 6369 additions and 3 deletions


@ -1,4 +1,85 @@
# HaloScope
This is the source code accompanying the NeurIPS'24 spotlight paper [***HaloScope: Harnessing Unlabeled LLM Generations for Hallucination Detection***](https://arxiv.org/abs/2409.17504) by Xuefeng Du, Chaowei Xiao, and Yixuan Li.
## Requirements
```
conda env create -f env.yml
```
## Models Preparation
Please download the LLaMA-2 [7B / 13B](https://huggingface.co/meta-llama) models and the OPT [6.7b](https://huggingface.co/facebook/opt-6.7b) / [13b](https://huggingface.co/facebook/opt-13b) models. Set up a local directory for saving the models:
```
mkdir models
```
Then put the downloaded model checkpoints inside this folder.
## Get LLM generations
First, make a local directory for saving the LLM-generated answers, the model-generated truthfulness ground truth, the extracted features, etc.:
```
mkdir save_for_eval
```
For TruthfulQA, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_llama.py --dataset_name tqa --model_name llama2_chat_7B --most_likely 1 --num_gene 1 --gene 1
```
- "most_likely" means whether you want to generate the most likely answers for testing (most_likely == 1) or generate multiple answers with sampling techniques for uncertainty estimation.
- "num_gene" is how many samples we generate for each question, for most_likely==1, num_gene should be 1 otherwise we set num_gene to 10.
- "dataset_name" can be chosen from tqa, coqa, triviaqa, tydiqa
- "model_name" can be chosen from llama2_chat_7B, and llama2_chat_13B
Please check section 4.1 implementation details in the paper for reference.
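The two decoding modes above map to different generation settings in the script (beam search for the single most likely answer, temperature sampling when collecting multiple generations). A minimal sketch of that mapping, using the values that appear in `hal_det_llama.py`:

```python
def generation_kwargs(most_likely: bool, num_gene: int) -> dict:
    """Sketch of how --most_likely / --num_gene map to HF generate() kwargs.

    Values mirror those used in hal_det_llama.py; treat this helper itself
    as illustrative, not part of the released interface.
    """
    if most_likely:
        # Single deterministic answer via beam search.
        assert num_gene == 1, "most_likely mode generates exactly one answer"
        return dict(num_beams=5, do_sample=False, num_return_sequences=1,
                    max_new_tokens=64)
    # Multiple sampled answers for uncertainty estimation.
    return dict(do_sample=True, num_beams=1, num_return_sequences=1,
                max_new_tokens=64, temperature=0.5, top_p=1.0)
```

Each of the `num_gene` generations is produced with one such call, so the sampling branch is invoked repeatedly rather than using batched return sequences.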
For OPT models, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_opt.py --dataset_name tqa --model_name opt-6.7b --most_likely 1 --num_gene 1 --gene 1
```
## Get the ground truth for the LLM generations
Since there is no ground truth for the generated answers, we leverage ROUGE and [BLEURT](https://arxiv.org/abs/2004.04696) to estimate whether an answer is true or false.
To download the BLEURT models, please refer to [here](https://github.com/lucadiliello/bleurt-pytorch) and put them in the ./models folder.
For TruthfulQA, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_llama.py --dataset_name tqa --model_name llama2_chat_7B --most_likely 1 --use_rouge 0 --generate_gt 1
```
- when "use_rouge" is 1, then we use rouge for determining the ground truth, otherwise we use BleuRT.
For OPT models, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_opt.py --dataset_name tqa --model_name opt-6.7b --most_likely 1 --use_rouge 0 --generate_gt 1
```
## Hallucination detection
For TruthfulQA, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_llama.py --dataset_name tqa --model_name llama2_chat_7B --use_rouge 0 --most_likely 1 --weighted_svd 1 --feat_loc_svd 3
```
- "weighted_svd" denotes whether we need the weighting coeffcient by the singular values in the score.
- "feat_loc_svd" denotes which location in a transformer block do we extract the representations, 3 is block output, 2 is mlp output and 1 is attention head output.
For OPT models, please run:
```
CUDA_VISIBLE_DEVICES=0 python hal_det_opt.py --dataset_name tqa --model_name opt-6.7b --use_rouge 0 --most_likely 1 --weighted_svd 1 --feat_loc_svd 3
```
## Citation
If you find any part of this code useful in your research, please consider citing our paper:
```
@inproceedings{du2024haloscope,
  title={HaloScope: Harnessing Unlabeled LLM Generations for Hallucination Detection},
  author={Xuefeng Du and Chaowei Xiao and Yixuan Li},
  booktitle={Advances in Neural Information Processing Systems},
  year={2024}
}
```

1
TruthfulQA Submodule

@ -0,0 +1 @@
Subproject commit fdd8ad1c0d00a478cf8b0bb41a3ad8378c16293b

1
coqa-dev-v1.0.json Normal file

File diff suppressed because one or more lines are too long

711
env.yml Normal file

@ -0,0 +1,711 @@
name: base
channels:
- conda-forge
- defaults
dependencies:
- _anaconda_depends=2020.07=py38_0
- _ipyw_jlab_nb_ext_conf=0.1.0=py38_0
- _libgcc_mutex=0.1=main
- alabaster=0.7.12=py_0
- anaconda=custom=py38_1
- anaconda-client=1.7.2=py38_0
- anaconda-navigator=1.10.0=py38_0
- anaconda-project=0.8.4=py_0
- argh=0.26.2=py38_0
- argon2-cffi=20.1.0=py38h7b6447c_1
- asn1crypto=1.4.0=py_0
- astroid=2.4.2=py38_0
- astropy=4.0.2=py38h7b6447c_0
- async_generator=1.10=py_0
- atomicwrites=1.4.0=py_0
- autopep8=1.5.4=py_0
- babel=2.8.1=pyhd3eb1b0_0
- backcall=0.2.0=py_0
- backports=1.0=py_2
- backports.functools_lru_cache=1.6.1=py_0
- backports.shutil_get_terminal_size=1.0.0=py38_2
- backports.tempfile=1.0=py_1
- backports.weakref=1.0.post1=py_1
- beautifulsoup4=4.9.3=pyhb0f4dca_0
- bitarray=1.6.1=py38h27cfd23_0
- bkcharts=0.2=py38_0
- blas=1.0=mkl
- bleach=3.2.1=py_0
- blosc=1.20.1=hd408876_0
- bokeh=2.2.3=py38_0
- boto=2.49.0=py38_0
- bottleneck=1.3.2=py38heb32a55_1
- brotlipy=0.7.0=py38h7b6447c_1000
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2021.5.30=ha878542_0
- cairo=1.14.12=h8948797_3
- certifi=2021.5.30=py38h578d9bd_0
- cffi=1.14.3=py38he30daa8_0
- chardet=3.0.4=py38_1003
- clyent=1.2.2=py38_1
- colorama=0.4.4=py_0
- conda=4.10.1=py38h578d9bd_0
- conda-build=3.20.5=py38_1
- conda-env=2.6.0=1
- conda-package-handling=1.7.2=py38h03888b9_0
- conda-verify=3.4.2=py_1
- contextlib2=0.6.0.post1=py_0
- cryptography=3.1.1=py38h1ba5d50_0
- curl=7.71.1=hbc83047_1
- cycler=0.10.0=py38_0
- cython=0.29.21=py38he6710b0_0
- cytoolz=0.11.0=py38h7b6447c_0
- dask=2.30.0=py_0
- dask-core=2.30.0=py_0
- dbus=1.13.18=hb2f20db_0
- decorator=4.4.2=py_0
- defusedxml=0.6.0=py_0
- diff-match-patch=20200713=py_0
- distributed=2.30.1=py38h06a4308_0
- docutils=0.16=py38_1
- entrypoints=0.3=py38_0
- et_xmlfile=1.0.1=py_1001
- expat=2.2.10=he6710b0_2
- fastcache=1.1.0=py38h7b6447c_0
- fontconfig=2.13.0=h9420a91_0
- freetype=2.10.4=h5ab3b9f_0
- fribidi=1.0.10=h7b6447c_0
- future=0.18.2=py38_1
- get_terminal_size=1.0.0=haa9412d_0
- gevent=20.9.0=py38h7b6447c_0
- glib=2.66.1=h92f7085_0
- glob2=0.7=py_0
- gmp=6.1.2=h6c8ec71_1
- gmpy2=2.0.8=py38hd5f6e3b_3
- graphite2=1.3.14=h23475e2_0
- greenlet=0.4.17=py38h7b6447c_0
- gst-plugins-base=1.14.0=hbbd80ab_1
- gstreamer=1.14.0=hb31296c_0
- h5py=2.10.0=py38h7918eee_0
- harfbuzz=2.4.0=hca77d97_1
- hdf5=1.10.4=hb1b8bf9_0
- heapdict=1.0.1=py_0
- html5lib=1.1=py_0
- icu=58.2=he6710b0_3
- idna=2.10=py_0
- imageio=2.9.0=py_0
- imagesize=1.2.0=py_0
- importlib_metadata=2.0.0=1
- iniconfig=1.1.1=py_0
- intel-openmp=2020.2=254
- intervaltree=3.1.0=py_0
- ipykernel=5.3.4=py38h5ca1d4c_0
- ipython=7.19.0=py38hb070fc8_0
- ipython_genutils=0.2.0=py38_0
- ipywidgets=7.5.1=py_1
- jbig=2.1=hdba287a_0
- jdcal=1.4.1=py_0
- jedi=0.17.1=py38_0
- jpeg=9b=h024ee3a_2
- json5=0.9.5=py_0
- jsonschema=3.2.0=py_2
- jupyter=1.0.0=py38_7
- jupyter_client=6.1.7=py_0
- jupyter_console=6.2.0=py_0
- jupyter_core=4.6.3=py38_0
- jupyterlab=2.2.6=py_0
- jupyterlab_pygments=0.1.2=py_0
- jupyterlab_server=1.2.0=py_0
- keyring=21.4.0=py38_1
- kiwisolver=1.3.0=py38h2531618_0
- krb5=1.18.2=h173b8e3_0
- lazy-object-proxy=1.4.3=py38h7b6447c_0
- lcms2=2.11=h396b838_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libarchive=3.4.2=h62408e4_0
- libblas=3.9.0=1_h6e990d7_netlib
- libcblas=3.9.0=3_h893e4fe_netlib
- libcurl=7.71.1=h20c2e04_1
- libedit=3.1.20191231=h14c3975_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.1.0=hdf63c60_0
- libgfortran-ng=7.5.0=h14aa051_19
- libgfortran4=7.5.0=h14aa051_19
- liblapack=3.9.0=3_h893e4fe_netlib
- liblief=0.10.1=he6710b0_0
- libllvm10=10.0.1=hbcb73fb_5
- libllvm9=9.0.1=he513fc3_1
- libpng=1.6.37=hbc83047_0
- libsodium=1.0.18=h7b6447c_0
- libspatialindex=1.9.3=he6710b0_0
- libssh2=1.9.0=h1ba5d50_1
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.1.0=h2733197_1
- libtool=2.4.6=h7b6447c_1005
- libuuid=1.0.3=h1bed415_2
- libxcb=1.14=h7b6447c_0
- libxml2=2.9.10=hb55368b_3
- libxslt=1.1.34=hc22bd24_0
- llvmlite=0.34.0=py38h269e1b5_4
- locket=0.2.0=py38_1
- lxml=4.6.1=py38hefd8a0e_0
- lz4-c=1.9.2=heb0550a_3
- lzo=2.10=h7b6447c_2
- mccabe=0.6.1=py38_1
- mistune=0.8.4=py38h7b6447c_1000
- mkl=2020.2=256
- mkl-service=2.3.0=py38he904b0f_0
- mkl_fft=1.2.0=py38h23d657b_0
- mkl_random=1.1.1=py38h0573a6f_0
- mock=4.0.2=py_0
- more-itertools=8.6.0=pyhd3eb1b0_0
- mpc=1.1.0=h10f8cd9_1
- mpfr=4.0.2=hb69a4c5_1
- mpmath=1.1.0=py38_0
- msgpack-python=1.0.0=py38hfd86e86_1
- multipledispatch=0.6.0=py38_0
- navigator-updater=0.2.1=py38_0
- nbclient=0.5.1=py_0
- nbconvert=6.0.7=py38_0
- nbformat=5.0.8=py_0
- ncurses=6.2=he6710b0_1
- nest-asyncio=1.4.2=pyhd3eb1b0_0
- networkx=2.5=py_0
- nose=1.3.7=py38_2
- notebook=6.1.4=py38_0
- numba=0.51.2=py38h0573a6f_1
- numexpr=2.7.1=py38h423224d_0
- numpydoc=1.1.0=pyhd3eb1b0_1
- olefile=0.46=py_0
- openjpeg=2.3.1=h981e76c_3
- openssl=1.1.1k=h27cfd23_0
- pandoc=2.11=hb0f4dca_0
- pandocfilters=1.4.3=py38h06a4308_1
- pango=1.45.3=hd140c19_0
- parso=0.7.0=py_0
- partd=1.1.0=py_0
- patchelf=0.12=he6710b0_0
- path=15.0.0=py38_0
- path.py=12.5.0=0
- pathlib2=2.3.5=py38_0
- pathtools=0.1.2=py_1
- pcre=8.44=he6710b0_0
- pep8=1.7.1=py38_0
- pexpect=4.8.0=py38_0
- pickleshare=0.7.5=py38_1000
- pillow=8.0.1=py38he98fc37_0
- pixman=0.40.0=h7b6447c_0
- pkginfo=1.6.1=py38h06a4308_0
- pluggy=0.13.1=py38_0
- ply=3.11=py38_0
- poppler=0.81.0=he6a58d2_1
- poppler-data=0.4.10=0
- prompt-toolkit=3.0.8=py_0
- prompt_toolkit=3.0.8=0
- psutil=5.7.2=py38h7b6447c_0
- ptyprocess=0.6.0=py38_0
- py=1.9.0=py_0
- py-lief=0.10.1=py38h403a769_0
- pycosat=0.6.3=py38h7b6447c_1
- pycparser=2.20=py_2
- pycurl=7.43.0.6=py38h1ba5d50_0
- pydocstyle=5.1.1=py_0
- pygments=2.7.2=pyhd3eb1b0_0
- pylint=2.6.0=py38_0
- pyodbc=4.0.30=py38he6710b0_0
- pyopenssl=19.1.0=py_1
- pyparsing=2.4.7=py_0
- pyqt=5.9.2=py38h05f1152_4
- pyrsistent=0.17.3=py38h7b6447c_0
- pysocks=1.7.1=py38_0
- pytables=3.6.1=py38h9fd0a39_0
- python=3.8.5=h7579374_1
- python-jsonrpc-server=0.4.0=py_0
- python-language-server=0.35.1=py_0
- python-libarchive-c=2.9=py_0
- python_abi=3.8=1_cp38
- pywavelets=1.1.1=py38h7b6447c_2
- pyxdg=0.27=pyhd3eb1b0_0
- pyzmq=19.0.2=py38he6710b0_1
- qdarkstyle=2.8.1=py_0
- qt=5.9.7=h5867ecd_1
- qtawesome=1.0.1=py_0
- qtconsole=4.7.7=py_0
- qtpy=1.9.0=py_0
- readline=8.0=h7b6447c_0
- ripgrep=12.1.1=0
- rope=0.18.0=py_0
- rtree=0.9.4=py38_1
- ruamel_yaml=0.15.87=py38h7b6447c_1
- scikit-image=0.17.2=py38hdf5156a_0
- seaborn=0.11.0=py_0
- send2trash=1.5.0=py38_0
- simplegeneric=0.8.1=py38_2
- singledispatch=3.4.0.3=py_1001
- sip=4.19.13=py38he6710b0_0
- snappy=1.1.8=he1b5a44_3
- snowballstemmer=2.0.0=py_0
- sortedcollections=1.2.1=py_0
- sortedcontainers=2.2.2=py_0
- soupsieve=2.0.1=py_0
- sphinx=3.2.1=py_0
- sphinxcontrib=1.0=py38_1
- sphinxcontrib-applehelp=1.0.2=py_0
- sphinxcontrib-devhelp=1.0.2=py_0
- sphinxcontrib-htmlhelp=1.0.3=py_0
- sphinxcontrib-jsmath=1.0.1=py_0
- sphinxcontrib-qthelp=1.0.3=py_0
- sphinxcontrib-serializinghtml=1.1.4=py_0
- sphinxcontrib-websupport=1.2.4=py_0
- spyder=4.1.5=py38_0
- spyder-kernels=1.9.4=py38_0
- sqlalchemy=1.3.20=py38h7b6447c_0
- sqlite=3.33.0=h62c20be_0
- sympy=1.6.2=py38h06a4308_1
- tbb=2020.3=hfd86e86_0
- tblib=1.7.0=py_0
- terminado=0.9.1=py38_0
- testpath=0.4.4=py_0
- tifffile=2020.10.1=py38hdd07704_2
- tk=8.6.10=hbc83047_0
- toml=0.10.1=py_0
- toolz=0.11.1=py_0
- tornado=6.0.4=py38h7b6447c_1
- traitlets=5.0.5=py_0
- ujson=4.0.1=py38he6710b0_0
- unicodecsv=0.14.1=py38_0
- unixodbc=2.3.9=h7b6447c_0
- urllib3=1.25.11=py_0
- watchdog=0.10.3=py38_0
- wcwidth=0.2.5=py_0
- webencodings=0.5.1=py38_1
- widgetsnbextension=3.5.1=py38_0
- wurlitzer=2.0.1=py38_0
- xlrd=1.2.0=py_0
- xlsxwriter=1.3.7=py_0
- xlwt=1.3.0=py38_0
- xmltodict=0.12.0=py_0
- xz=5.2.5=h7b6447c_0
- yaml=0.2.5=h7b6447c_0
- zeromq=4.3.3=he6710b0_3
- zict=2.0.0=py_0
- zipp=3.4.0=pyhd3eb1b0_0
- zlib=1.2.11=h7b6447c_3
- zope=1.0=py38_1
- zope.event=4.5.0=py38_0
- zope.interface=5.1.2=py38h7b6447c_0
- zstd=1.4.5=h9ceee32_0
- pip:
- absl-py==1.0.0
- accelerate==0.27.2
- addict==2.4.0
- aiohttp==3.8.1
- aiohttp-cors==0.7.0
- aioredis==2.0.0
- aiosignal==1.2.0
- annotated-types==0.6.0
- antlr4-python3-runtime==4.8
- anyio==4.2.0
- anykeystore==0.2
- apex==0.1
- appdirs==1.4.4
- argparse==1.4.0
- array-record==0.4.0
- arrow==1.2.2
- ase==3.21.1
- astunparse==1.6.3
- async-timeout==4.0.1
- attrs==23.2.0
- autoattack==0.1
- backports-zoneinfo==0.2.1
- baukit==0.0.1
- bdd100k==1.0.0
- bert-score==0.3.13
- black==22.3.0
- blessings==1.7
- bleurt==0.0.2
- bleurt-pytorch==0.0.1
- blis==0.7.11
- boto3==1.18.9
- botocore==1.21.9
- brokenaxes==0.4.2
- brotli==1.0.9
- cachetools==4.2.1
- carla==0.9.12
- catalogue==2.0.10
- charset-normalizer==2.0.8
- chex==0.1.4
- cityscapesscripts==2.2.0
- clang==5.0
- click==8.0.3
- clip==1.0
- cloudpathlib==0.16.0
- cloudpickle==1.3.0
- clu==0.0.10
- cmake==3.24.1.1
- coloredlogs==15.0.1
- commonmark==0.9.1
- confection==0.1.4
- contourpy==1.1.0
- coverage==6.3.2
- cryptacular==1.5.5
- cymem==2.0.8
- dalle-mini==0.1.1
- dash==2.0.0
- dash-bootstrap-components==1.0.1
- dash-core-components==2.0.0
- dash-html-components==2.0.0
- dash-table==5.0.0
- dataclasses==0.6
- datasets==2.17.1
- deprecated==1.2.13
- descartes==1.1.0
- diffdist==0.1
- diffusers==0.3.0
- dill==0.3.6
- diskcache==5.6.3
- distro==1.9.0
- dm-tree==0.1.6
- dnspython==2.6.1
- docker-pycreds==0.4.0
- docstring-parser==0.15
- easydict==1.10
- editdistance==0.8.1
- einops==0.3.2
- email-validator==2.2.0
- emoji==2.0.0
- etils==1.3.0
- evaluate==0.3.0
- exceptiongroup==1.2.0
- fairscale==0.4.12
- faiss-gpu==1.7.1.post2
- fastapi==0.111.0
- fastapi-cli==0.0.4
- fastchat==0.1.0
- filelock==3.15.3
- fire==0.4.0
- flake8==3.7.9
- flake8-import-order==0.18.1
- flash-attn==2.5.9.post1
- flask==2.0.2
- flask-compress==1.10.1
- flask-cors==3.0.10
- flatbuffers==23.5.26
- flax==0.5.3
- fonttools==4.31.1
- frozenlist==1.2.0
- fsspec==2023.10.0
- ftfy==6.1.1
- functorch==1.13.1
- fvcore==0.1.5.post20211023
- gast==0.3.3
- gdown==4.6.0
- gdrive==0.1.5
- gin-config==0.5.0
- gitdb==4.0.9
- gitpython==3.1.27
- gmplot==1.4.1
- google-api-core==2.11.0
- google-api-python-client==2.43.0
- google-auth==2.28.1
- google-auth-httplib2==0.1.0
- google-auth-oauthlib==1.0.0
- google-pasta==0.2.0
- googleapis-common-protos==1.57.0
- googledrivedownloader==0.4
- gpustat==0.6.0
- grpcio==1.60.1
- gym==0.17.1
- h11==0.14.0
- higher==0.2.1
- httpcore==1.0.3
- httplib2==0.21.0
- httptools==0.6.1
- httpx==0.26.0
- huggingface-hub==0.23.4
- humanfriendly==9.2
- hupper==1.10.3
- hydra-core==1.1.1
- immutabledict==4.1.0
- importlib-metadata==7.0.1
- importlib-resources==6.4.0
- interegular==0.3.3
- invisible-watermark==0.1.5
- invoke==1.6.0
- iopath==0.1.9
- ipdb==0.13.4
- isodate==0.6.0
- isort==4.3.21
- itsdangerous==2.0.1
- jax==0.4.13
- jaxlib==0.4.13
- jeepney==0.8.0
- jinja2==3.0.3
- jmespath==0.10.0
- joblib==1.1.0
- jsonlines==4.0.0
- keras==2.12.0
- keras-applications==1.0.8
- keras-nightly==2.5.0.dev2021032900
- keras-preprocessing==1.1.2
- kornia==0.6.0
- langcodes==3.3.0
- langdetect==1.0.9
- lark==1.1.9
- lazy-loader==0.1rc2
- libclang==14.0.1
- littleutils==0.2.2
- lm-format-enforcer==0.10.1
- lz4==3.1.10
- markdown==3.3.4
- markupsafe==2.0.1
- matplotlib==3.7.0
- mesh-tensorflow==0.1.21
- ml-collections==0.1.1
- ml-dtypes==0.2.0
- mlconfig==0.1.0
- mmcv==1.2.7
- mmcv-full==1.2.7
- mmpycocotools==12.0.3
- motmetrics==1.2.0
- multidict==5.2.0
- multiprocess==0.70.14
- multiscaledeformableattention==1.0
- murmurhash==1.0.10
- mypy-extensions==0.4.3
- nanoid==2.0.0
- ninja==1.11.1.1
- nltk==3.8.1
- numpy==1.23.4
- numpy-quaternion==2022.2.10.14.20.39
- nuscenes-devkit==1.1.7
- nvidia-cublas-cu11==11.10.3.66
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu11==11.7.99
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu11==11.7.99
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu11==8.5.0.96
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-ml-py==12.555.43
- nvidia-ml-py3==7.352.0
- nvidia-nccl-cu12==2.20.5
- nvidia-nvjitlink-cu12==12.5.40
- nvidia-nvtx-cu12==12.1.105
- oauthlib==3.1.0
- ogb==1.3.5
- omegaconf==2.1.1
- onnx==1.12.0
- onnxruntime==1.12.1
- openai==0.25.0
- opencensus==0.8.0
- opencensus-context==0.1.2
- opencv-contrib-python==4.5.5.62
- opencv-python==4.6.0.66
- openpyxl==3.1.2
- opt-einsum==3.3.0
- optax==0.1.3
- orjson==3.10.5
- outdated==0.2.1
- outlines==0.0.45
- packaging==21.3
- pandas==1.3.5
- pandas-flavor==0.3.0
- pandas-stubs==2.0.3.230814
- parameterized==0.8.1
- pascal-voc-tools==0.1.29
- pastedeploy==2.1.1
- pathspec==0.9.0
- patsy==0.5.2
- pbkdf2==1.3
- pbr==5.5.1
- pdf2image==1.15.1
- peft==0.11.1
- pingouin==0.5.1
- pip==21.1
- plaster==1.0
- plaster-pastedeploy==0.7
- platformdirs==2.5.2
- plotly==5.4.0
- plyfile==0.7.4
- portalocker==2.2.1
- preshed==3.0.9
- prettytable==0.7.2
- prometheus-client==0.20.0
- prometheus-fastapi-instrumentator==7.0.0
- promise==2.3
- protobuf==3.20.3
- py-cpuinfo==8.0.0
- py-spy==0.3.11
- py-term==0.6
- pyairports==2.1.1
- pyarrow==15.0.0
- pyarrow-hotfix==0.6
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pycocotools==2.0.2
- pycodestyle==2.5.0
- pycountry==24.6.1
- pydantic==2.6.1
- pydantic-core==2.16.2
- pydeprecate==0.3.2
- pydot==1.4.2
- pyflakes==2.1.1
- pygame==2.1.0
- pyglet==1.5.0
- pyglove==0.4.4
- pyhumps==3.0.2
- pynndescent==0.5.2
- pyproj==3.3.0
- pyquaternion==0.9.9
- pyramid==2.0
- pyramid-mailer==0.15.1
- pyre-extensions==0.0.23
- pytest==7.0.1
- pytest-benchmark==3.4.1
- pytest-cov==2.12.1
- python-dateutil==2.8.2
- python-dotenv==1.0.1
- python-louvain==0.15
- python-multipart==0.0.9
- python3-openid==3.2.0
- pytorch-lightning==1.7.7
- pytz==2021.3
- pytz-deprecation-shim==0.1.0.post0
- pyyaml==6.0
- ray==2.10.0
- rdflib==5.0.0
- redis==4.0.2
- referencing==0.35.1
- regex==2023.12.25
- repoze-sendmail==4.4.1
- requests==2.32.3
- requests-oauthlib==1.3.0
- responses==0.18.0
- rich==11.2.0
- rouge-score==0.1.2
- rpds-py==0.18.1
- rpy2==3.5.1
- rsa==4.7.2
- s3transfer==0.5.0
- sacrebleu==2.4.0
- safetensors==0.4.2
- scalabel==0.3.0rc2
- scikit-learn==1.0.1
- scikit-plot==0.3.7
- scipy==1.10.1
- secretstorage==3.3.3
- selfcheckgpt==0.1.6
- sentencepiece==0.1.99
- sentry-sdk==1.9.0
- seqio-nightly==0.0.18.dev20240215
- setproctitle==1.3.2
- setuptools==59.6.0
- shapely==1.7.1
- shellingham==1.5.4
- shortuuid==1.0.9
- six==1.16.0
- sklearn==0.0
- smart-open==6.4.0
- smmap==5.0.0
- sniffio==1.3.0
- spacy==3.7.4
- spacy-legacy==3.0.12
- spacy-loggers==1.0.5
- sphinxcontrib-apidoc==0.3.0
- srsly==2.4.8
- starlette==0.37.2
- statsmodels==0.13.2
- svgwrite==1.4.1
- t5==0.7.0
- tabulate==0.8.9
- taming-transformers==0.0.1
- tb-nightly==2.11.0a20221104
- tenacity==8.0.1
- tensorboard==2.12.3
- tensorboard-data-server==0.7.2
- tensorboard-logger==0.1.0
- tensorboard-plugin-wit==1.8.0
- tensorboardx==2.2
- tensorflow==2.12.1
- tensorflow-addons==0.17.0
- tensorflow-datasets==4.9.2
- tensorflow-estimator==2.12.0
- tensorflow-hub==0.16.1
- tensorflow-io==0.26.0
- tensorflow-io-gcs-filesystem==0.26.0
- tensorflow-metadata==1.8.0
- tensorflow-text==2.12.1
- tensorstore==0.1.22
- termcolor==1.1.0
- tf-keras==2.15.0
- tf-slim==1.1.0
- tfds-nightly==4.9.2.dev202308090034
- tflearn==0.5.0
- thinc==8.2.3
- threadpoolctl==3.0.0
- tiktoken==0.7.0
- timm==0.4.12
- tokenizers==0.19.1
- tomli==2.0.1
- torch==1.6.0
- torch-cluster==1.5.9
- torch-geometric==1.6.3
- torch-scatter==2.0.6
- torch-sparse==0.6.9
- torch-spline-conv==1.2.1
- torchaudio==0.11.0+cu113
- torchlars==0.1.2
- torchmetrics==0.9.3
- torchvision==0.14.1
- tqdm==4.62.3
- transaction==3.0.1
- transformers==4.42.3
- translationstring==1.4
- tree==0.2.4
- triton==2.3.0
- typeguard==2.13.3
- typer==0.12.3
- types-pytz==2024.1.0.20240203
- typing==3.7.4.3
- typing-extensions==4.10.0
- typing-inspect==0.8.0
- tzdata==2022.1
- tzlocal==4.2
- umap-learn==0.5.1
- uncertainty-calibration==0.0.8
- unidecode==1.3.4
- unrar==0.4
- uritemplate==4.1.1
- uvicorn==0.30.1
- uvloop==0.19.0
- velruse==1.1.1
- venusian==3.0.0
- versioneer==0.28
- vllm==0.5.0.post1
- vllm-flash-attn==2.5.9
- vqgan-jax==0.0.1
- wand==0.6.5
- wandb==0.13.1
- wasabi==1.1.2
- watchfiles==0.22.0
- waymo-open-dataset-tf-2-3-0==1.3.1
- weasel==0.3.4
- webob==1.8.7
- websockets==12.0
- werkzeug==2.0.2
- wheel==0.37.1
- wilds==1.2.2
- wrapt==1.12.1
- wtforms==2.3.3
- wtforms-recaptcha==0.3.2
- xarray==2022.3.0
- xformers==0.0.26.post1
- xxhash==3.4.1
- yacs==0.1.8
- yapf==0.29.0
- yarl==1.7.2
- ylib==0.1.0
- zope-deprecation==4.4.0
- zope-sqlalchemy==1.4
prefix: /u/x/f/xfdu/anaconda3

670
hal_det_llama.py Normal file

@ -0,0 +1,670 @@
import os
import re
import pickle
import argparse
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import evaluate
import datasets  # used below for tydiqa / coqa loading
from datasets import load_metric
from datasets import load_dataset
from tqdm import tqdm
from pprint import pprint
from baukit import Trace, TraceDict
from torch.autograd import Variable

import llama_iti
from utils import get_llama_activations_bau, tokenized_tqa, tokenized_tqa_gen, tokenized_tqa_gen_end_q
from metric_utils import get_measures, print_measures

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN pick non-deterministic kernels, defeating the setting above
    torch.backends.cudnn.benchmark = False
HF_NAMES = {
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
'alpaca_7B': 'circulus/alpaca-7b',
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
'llama2_chat_7B': 'models/Llama-2-7b-chat-hf',
'llama2_chat_13B': 'models/Llama-2-13b-chat-hf',
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
}
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='llama2_chat_7B')
parser.add_argument('--dataset_name', type=str, default='tqa')
parser.add_argument('--num_gene', type=int, default=1)
parser.add_argument('--gene', type=int, default=0)
parser.add_argument('--generate_gt', type=int, default=0)
parser.add_argument('--use_rouge', type=int, default=0)
parser.add_argument('--weighted_svd', type=int, default=0)
parser.add_argument('--feat_loc_svd', type=int, default=0)
parser.add_argument('--wild_ratio', type=float, default=0.75)
parser.add_argument('--thres_gt', type=float, default=0.5)
parser.add_argument('--most_likely', type=int, default=0)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
args = parser.parse_args()
MODEL = HF_NAMES[args.model_name] if not args.model_dir else args.model_dir
if args.dataset_name == "tqa":
dataset = load_dataset("truthful_qa", 'generation')['validation']
elif args.dataset_name == 'triviaqa':
dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation")
id_mem = set()
def remove_dups(batch):
if batch['question_id'][0] in id_mem:
return {_: [] for _ in batch.keys()}
id_mem.add(batch['question_id'][0])
return batch
dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
elif args.dataset_name == 'tydiqa':
dataset = datasets.load_dataset("tydiqa", "secondary_task", split="train")
used_indices = []
for i in range(len(dataset)):
if 'english' in dataset[i]['id']:
used_indices.append(i)
elif args.dataset_name == 'coqa':
import json
import pandas as pd
from datasets import Dataset
def _save_dataset():
# https://github.com/lorenzkuhn/semantic_uncertainty/blob/main/code/parse_coqa.py
save_path = f'./coqa_dataset'
if not os.path.exists(save_path):
# https://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json
with open(f'./coqa-dev-v1.0.json', 'r') as infile:
data = json.load(infile)['data']
dataset = {}
dataset['story'] = []
dataset['question'] = []
dataset['answer'] = []
dataset['additional_answers'] = []
dataset['id'] = []
for sample_id, sample in enumerate(data):
story = sample['story']
questions = sample['questions']
answers = sample['answers']
additional_answers = sample['additional_answers']
for question_index, question in enumerate(questions):
dataset['story'].append(story)
dataset['question'].append(question['input_text'])
dataset['answer'].append({
'text': answers[question_index]['input_text'],
'answer_start': answers[question_index]['span_start']
})
dataset['id'].append(sample['id'] + '_' + str(question_index))
additional_answers_list = []
for i in range(3):
additional_answers_list.append(additional_answers[str(i)][question_index]['input_text'])
dataset['additional_answers'].append(additional_answers_list)
story = story + ' Q: ' + question['input_text'] + ' A: ' + answers[question_index]['input_text']
if not story[-1] == '.':
story = story + '.'
dataset_df = pd.DataFrame.from_dict(dataset)
dataset = Dataset.from_pandas(dataset_df)
dataset.save_to_disk(save_path)
return save_path
# dataset = datasets.load_from_disk(_save_dataset())
def get_dataset(tokenizer, split='validation'):
# from https://github.com/lorenzkuhn/semantic_uncertainty/blob/main/code/parse_coqa.py
dataset = datasets.load_from_disk(_save_dataset())
id_to_question_mapping = dict(zip(dataset['id'], dataset['question']))
def encode_coqa(example):
example['answer'] = [example['answer']['text']] + example['additional_answers']
example['prompt'] = prompt = example['story'] + ' Q: ' + example['question'] + ' A:'
return tokenizer(prompt, truncation=False, padding=False)
dataset = dataset.map(encode_coqa, batched=False, load_from_cache_file=False)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'], output_all_columns=True)
return dataset
dataset = get_dataset(llama_iti.LlamaTokenizer.from_pretrained(MODEL, trust_remote_code=True))
else:
raise ValueError("Invalid dataset name")
if args.gene:
tokenizer = llama_iti.LlamaTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = llama_iti.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16,
device_map="auto").cuda()
begin_index = 0
if args.dataset_name == 'tydiqa':
end_index = len(used_indices)
else:
end_index = len(dataset)
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/')
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/answers'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/answers')
period_token_id = [tokenizer(_)['input_ids'][-1] for _ in ['\n']]
period_token_id += [tokenizer.eos_token_id]
for i in range(begin_index, end_index):
answers = [None] * args.num_gene
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(
dataset[i]['prompt'], return_tensors='pt').input_ids.cuda()
else:
question = dataset[i]['question']
prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda()
for gen_iter in range(args.num_gene):
if args.most_likely:
generated = model.generate(prompt,
num_beams=5,
num_return_sequences=1,
do_sample=False,
max_new_tokens=64,
)
else:
generated = model.generate(prompt,
do_sample=True,
num_return_sequences=1,
num_beams=1,
max_new_tokens=64,
temperature=0.5,
top_p=1.0)
decoded = tokenizer.decode(generated[0, prompt.shape[-1]:],
skip_special_tokens=True)
if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa':
# corner case.
if 'Answer the question concisely' in decoded:
print('#####error')
print(decoded.split('Answer the question concisely')[1])
print('#####error')
decoded = decoded.split('Answer the question concisely')[0]
if args.dataset_name == 'coqa':
if 'Q:' in decoded:
print('#####error')
print(decoded.split('Q:')[1])
print('#####error')
decoded = decoded.split('Q:')[0]
print(decoded)
answers[gen_iter] = decoded
print('sample: ', i)
if args.most_likely:
info = 'most_likely_'
else:
info = 'batch_generations_'
print("Saving answers")
np.save(f'./save_for_eval/{args.dataset_name}_hal_det/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy',
answers)
elif args.generate_gt:
from bleurt_pytorch import BleurtConfig, BleurtForSequenceClassification, BleurtTokenizer
model = BleurtForSequenceClassification.from_pretrained('./models/BLEURT-20').cuda()
tokenizer = BleurtTokenizer.from_pretrained('./models/BLEURT-20')
model.eval()
rouge = evaluate.load('rouge')
gts = np.zeros(0)
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in range(length):
if args.dataset_name == 'tqa':
best_answer = dataset[i]['best_answer']
correct_answer = dataset[i]['correct_answers']
all_answers = [best_answer] + correct_answer
elif args.dataset_name == 'triviaqa':
all_answers = dataset[i]['answer']['aliases']
elif args.dataset_name == 'coqa':
all_answers = dataset[i]['answer']
elif args.dataset_name == 'tydiqa':
all_answers = dataset[int(used_indices[i])]['answers']['text']
if args.most_likely:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
else:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
# get the gt.
if args.use_rouge:
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
all_results1 = np.zeros((len(all_answers), len(predictions)))
all_results2 = np.zeros((len(all_answers), len(predictions)))
for anw in range(len(all_answers)):
results = rouge.compute(predictions=predictions,
references=[all_answers[anw]] * len(predictions),
use_aggregator=False)
all_results[anw] = results['rougeL']
all_results1[anw] = results['rouge1']
all_results2[anw] = results['rouge2']
# breakpoint()
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 50 == 0:
print("samples passed: ", i)
else:
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
with torch.no_grad():
for anw in range(len(all_answers)):
inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions),
padding='longest', return_tensors='pt')
for key in list(inputs.keys()):
inputs[key] = inputs[key].cuda()
res = np.asarray(model(**inputs).logits.flatten().tolist())
all_results[anw] = res
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 10 == 0:
print("samples passed: ", i)
# breakpoint()
if args.most_likely:
if args.use_rouge:
np.save(f'./ml_{args.dataset_name}_rouge_score.npy', gts)
else:
np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
else:
if args.use_rouge:
np.save(f'./bg_{args.dataset_name}_rouge_score.npy', gts)
else:
np.save(f'./bg_{args.dataset_name}_bleurt_score.npy', gts)
else:
tokenizer = llama_iti.LlamaTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = llama_iti.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map="auto").cuda()
# firstly get the embeddings of the generated question and answers.
embed_generated = []
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in tqdm(range(length)):
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
else:
question = dataset[i]['question']
answers = np.load(
f'save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
if args.dataset_name == 'tydiqa':
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(dataset[i]['prompt'] + anw, return_tensors='pt').input_ids.cuda()
else:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
with torch.no_grad():
hidden_states = model(prompt, output_hidden_states=True).hidden_states
hidden_states = torch.stack(hidden_states, dim=0).squeeze()
hidden_states = hidden_states.detach().cpu().numpy()[:, -1, :]
embed_generated.append(hidden_states)
embed_generated = np.asarray(np.stack(embed_generated), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_gene_embeddings_layer_wise.npy', embed_generated)
HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)]
MLPS = [f"model.layers.{i}.mlp" for i in range(model.config.num_hidden_layers)]
embed_generated_loc2 = []
embed_generated_loc1 = []
for i in tqdm(range(length)):
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
else:
question = dataset[i]['question']
answers = np.load(
f'save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
if args.dataset_name == 'tydiqa':
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(dataset[i]['prompt'] + anw, return_tensors='pt').input_ids.cuda()
else:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
with torch.no_grad():
with TraceDict(model, HEADS + MLPS) as ret:
output = model(prompt, output_hidden_states=True)
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
embed_generated_loc2 = np.asarray(np.stack(embed_generated_loc2), dtype=np.float32)
embed_generated_loc1 = np.asarray(np.stack(embed_generated_loc1), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_gene_embeddings_head_wise.npy', embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_embeddings_mlp_wise.npy', embed_generated_loc2)
# get the split and label (true or false) of the unlabeled data and the test data.
if args.use_rouge:
gts = np.load(f'./ml_{args.dataset_name}_rouge_score.npy')
gts_bg = np.load(f'./bg_{args.dataset_name}_rouge_score.npy')
else:
gts = np.load(f'./ml_{args.dataset_name}_bleurt_score.npy')
gts_bg = np.load(f'./bg_{args.dataset_name}_bleurt_score.npy')
thres = args.thres_gt
gt_label = np.asarray(gts > thres, dtype=np.int32)
gt_label_bg = np.asarray(gts_bg > thres, dtype=np.int32)
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
permuted_index = np.random.permutation(length)
wild_q_indices = permuted_index[:int(args.wild_ratio * length)]
# exclude validation samples.
wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100]
wild_q_indices2 = wild_q_indices[len(wild_q_indices) - 100:]
gt_label_test = []
gt_label_wild = []
gt_label_val = []
for i in range(length):
if i not in wild_q_indices:
gt_label_test.extend(gt_label[i: i+1])
elif i in wild_q_indices1:
gt_label_wild.extend(gt_label[i: i+1])
else:
gt_label_val.extend(gt_label[i: i+1])
gt_label_test = np.asarray(gt_label_test)
gt_label_wild = np.asarray(gt_label_wild)
gt_label_val = np.asarray(gt_label_val)
def svd_embed_score(embed_generated_wild, gt_label, begin_k, k_span, mean=1, svd=1, weight=0):
embed_generated = embed_generated_wild
best_auroc_over_k = 0
best_layer_over_k = 0
best_scores_over_k = None
best_projection_over_k = None
for k in tqdm(range(begin_k, k_span)):
best_auroc = 0
best_layer = 0
best_scores = None
mean_recorded = None
best_projection = None
for layer in range(len(embed_generated_wild[0])):
if mean:
mean_recorded = embed_generated[:, layer, :].mean(0)
centered = embed_generated[:, layer, :] - mean_recorded
else:
centered = embed_generated[:, layer, :]
if not svd:
pca_model = PCA(n_components=k, whiten=False).fit(centered)
projection = pca_model.components_.T
mean_recorded = pca_model.mean_
if weight:
projection = pca_model.singular_values_ * projection
else:
_, sin_value, V_p = torch.linalg.svd(torch.from_numpy(centered).cuda())
projection = V_p[:k, :].T.cpu().data.numpy()
if weight:
projection = sin_value[:k] * projection
# average the coordinates in the top-k subspace, then take the magnitude as the score
scores = np.mean(np.matmul(centered, projection), -1, keepdims=True)
assert scores.shape[1] == 1
scores = np.sqrt(np.sum(np.square(scores), axis=1))
# the projection direction is sign-ambiguous: we do not know in advance whether
# truthful or hallucinated samples receive higher scores, so we evaluate both signs.
# similar practice appears in the representation engineering paper
# https://arxiv.org/abs/2310.01405
measures1 = get_measures(scores[gt_label == 1],
scores[gt_label == 0], plot=False)
measures2 = get_measures(-scores[gt_label == 1],
-scores[gt_label == 0], plot=False)
if measures1[0] > measures2[0]:
measures = measures1
sign_layer = 1
else:
measures = measures2
sign_layer = -1
if measures[0] > best_auroc:
best_auroc = measures[0]
best_result = [100 * measures[2], 100 * measures[0]]
best_layer = layer
best_scores = sign_layer * scores
best_projection = projection
best_mean = mean_recorded
best_sign = sign_layer
print('k: ', k, 'best result: ', best_result, 'layer: ', best_layer,
'mean: ', mean, 'svd: ', svd)
if best_auroc > best_auroc_over_k:
best_auroc_over_k = best_auroc
best_result_over_k = best_result
best_layer_over_k = best_layer
best_k = k
best_sign_over_k = best_sign
best_scores_over_k = best_scores
best_projection_over_k = best_projection
best_mean_over_k = best_mean
return {'k': best_k,
'best_layer':best_layer_over_k,
'best_auroc':best_auroc_over_k,
'best_result':best_result_over_k,
'best_scores':best_scores_over_k,
'best_mean': best_mean_over_k,
'best_sign':best_sign_over_k,
'best_projection':best_projection_over_k}
from sklearn.decomposition import PCA
feat_loc = args.feat_loc_svd
if args.most_likely:
if feat_loc == 3:
embed_generated = np.load(f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_gene_embeddings_layer_wise.npy',
allow_pickle=True)
elif feat_loc == 2:
embed_generated = np.load(
f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_embeddings_mlp_wise.npy',  # match the filename used when saving the mlp-wise embeddings
allow_pickle=True)
else:
embed_generated = np.load(
f'save_for_eval/{args.dataset_name}_hal_det/most_likely_{args.model_name}_gene_embeddings_head_wise.npy',
allow_pickle=True)
feat_indices_wild = []
feat_indices_eval = []
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in range(length):
if i in wild_q_indices1:
feat_indices_wild.extend(np.arange(i, i+1).tolist())
elif i in wild_q_indices2:
feat_indices_eval.extend(np.arange(i, i + 1).tolist())
if feat_loc == 3:
embed_generated_wild = embed_generated[feat_indices_wild][:,1:,:]
embed_generated_eval = embed_generated[feat_indices_eval][:, 1:, :]
else:
embed_generated_wild = embed_generated[feat_indices_wild]
embed_generated_eval = embed_generated[feat_indices_eval]
# returned_results = svd_embed_score(embed_generated_wild, gt_label_wild,
# 1, 11, mean=0, svd=0, weight=args.weighted_svd)
# get the best hyper-parameters on validation set
returned_results = svd_embed_score(embed_generated_eval, gt_label_val,
1, 11, mean=0, svd=0, weight=args.weighted_svd)
pca_model = PCA(n_components=returned_results['k'], whiten=False).fit(embed_generated_wild[:,returned_results['best_layer'],:])
projection = pca_model.components_.T
if args.weighted_svd:
projection = pca_model.singular_values_ * projection
scores = np.mean(np.matmul(embed_generated_wild[:,returned_results['best_layer'],:], projection), -1, keepdims=True)
assert scores.shape[1] == 1
best_scores = np.sqrt(np.sum(np.square(scores), axis=1)) * returned_results['best_sign']
# direct projection
feat_indices_test = []
for i in range(length):
if i not in wild_q_indices:
feat_indices_test.extend(np.arange(1 * i, 1 * i + 1).tolist())
if feat_loc == 3:
embed_generated_test = embed_generated[feat_indices_test][:, 1:, :]
else:
embed_generated_test = embed_generated[feat_indices_test]
test_scores = np.mean(np.matmul(embed_generated_test[:,returned_results['best_layer'],:],
projection), -1, keepdims=True)
assert test_scores.shape[1] == 1
test_scores = np.sqrt(np.sum(np.square(test_scores), axis=1))
measures = get_measures(returned_results['best_sign'] * test_scores[gt_label_test == 1],
returned_results['best_sign'] *test_scores[gt_label_test == 0], plot=False)
print_measures(measures[0], measures[1], measures[2], 'direct-projection')
thresholds = np.linspace(0,1, num=40)[1:-1]
normalizer = lambda x: x / (np.linalg.norm(x, ord=2, axis=-1, keepdims=True) + 1e-10)
auroc_over_thres = []
for thres_wild in thresholds:
best_auroc = 0
for layer in range(len(embed_generated_wild[0])):
thres_wild_score = np.sort(best_scores)[int(len(best_scores) * thres_wild)]
true_wild = embed_generated_wild[:,layer,:][best_scores > thres_wild_score]
false_wild = embed_generated_wild[:,layer,:][best_scores <= thres_wild_score]
embed_train = np.concatenate([true_wild,false_wild],0)
label_train = np.concatenate([np.ones(len(true_wild)),
np.zeros(len(false_wild))], 0)
## gt training, saplma
# embed_train = embed_generated_wild[:,layer,:]
# label_train = gt_label_wild
## gt training, saplma
from linear_probe import get_linear_acc
best_acc, final_acc, (
clf, best_state, best_preds, preds, labels_val), losses_train = get_linear_acc(
embed_train,
label_train,
embed_train,
label_train,
2, epochs = 50,
print_ret = True,
batch_size=512,
cosine=True,
nonlinear = True,
learning_rate = 0.05,
weight_decay = 0.0003)
clf.eval()
output = clf(torch.from_numpy(
embed_generated_test[:, layer, :]).cuda())
pca_wild_score_binary_cls = torch.sigmoid(output)
pca_wild_score_binary_cls = pca_wild_score_binary_cls.cpu().data.numpy()
if np.isnan(pca_wild_score_binary_cls).sum() > 0:
raise ValueError("NaN detected in classifier outputs")  # fail fast instead of dropping into a debugger
measures = get_measures(pca_wild_score_binary_cls[gt_label_test == 1],
pca_wild_score_binary_cls[gt_label_test == 0], plot=False)
if measures[0] > best_auroc:
best_auroc = measures[0]
best_result = [100 * measures[0]]
best_layer = layer
auroc_over_thres.append(best_auroc)
print('thres: ', thres_wild, 'best result: ', best_result, 'best_layer: ', best_layer)
if __name__ == '__main__':
seed_everything(42)
main()

681
hal_det_opt.py Normal file
@ -0,0 +1,681 @@
import os
import torch
import torch.nn.functional as F
import evaluate
import datasets
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import pickle
from utils import get_llama_activations_bau, tokenized_tqa, tokenized_tqa_gen, tokenized_tqa_gen_end_q
import llama_iti
import argparse
import matplotlib.pyplot as plt
from pprint import pprint
from baukit import Trace, TraceDict
from metric_utils import get_measures, print_measures
import re
from torch.autograd import Variable
from transformers import AutoModelForCausalLM, AutoTokenizer
def seed_everything(seed: int):
import random, os
import numpy as np
import torch
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False  # benchmark autotuning is non-deterministic; keep it off when seeding
HF_NAMES = {
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
'alpaca_7B': 'circulus/alpaca-7b',
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
'llama2_chat_7B': 'models/Llama-2-7b-chat-hf',
'llama2_chat_13B': 'models/Llama-2-13b-chat-hf',
"opt-6.7b": "models/opt-6.7b",
"opt-13b": "models/opt-13b",
}
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='opt-6.7b')
parser.add_argument('--dataset_name', type=str, default='tqa')
parser.add_argument('--num_gene', type=int, default=1)
parser.add_argument('--gene', type=int, default=0)
parser.add_argument('--generate_gt', type=int, default=0)
parser.add_argument('--use_rouge', type=int, default=0)
parser.add_argument('--weighted_svd', type=int, default=0)
parser.add_argument('--feat_loc_svd', type=int, default=0)
parser.add_argument('--wild_ratio', type=float, default=0.75)
parser.add_argument('--thres_gt', type=float, default=0.5)
parser.add_argument('--most_likely', type=int, default=0)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
args = parser.parse_args()
MODEL = HF_NAMES[args.model_name] if not args.model_dir else args.model_dir
if args.dataset_name == "tqa":
dataset = load_dataset("truthful_qa", 'generation')['validation']
elif args.dataset_name == 'triviaqa':
dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation")
id_mem = set()
def remove_dups(batch):
if batch['question_id'][0] in id_mem:
return {_: [] for _ in batch.keys()}
id_mem.add(batch['question_id'][0])
return batch
dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
elif args.dataset_name == 'tydiqa':
dataset = datasets.load_dataset("tydiqa", "secondary_task", split="train")
used_indices = []
for i in range(len(dataset)):
if 'english' in dataset[i]['id']:
used_indices.append(i)
elif args.dataset_name == 'coqa':
import json
import pandas as pd
from datasets import Dataset
def _save_dataset():
# https://github.com/lorenzkuhn/semantic_uncertainty/blob/main/code/parse_coqa.py
save_path = f'./coqa_dataset'
if not os.path.exists(save_path):
# https://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json
with open(f'./coqa-dev-v1.0.json', 'r') as infile:
data = json.load(infile)['data']
dataset = {}
dataset['story'] = []
dataset['question'] = []
dataset['answer'] = []
dataset['additional_answers'] = []
dataset['id'] = []
for sample_id, sample in enumerate(data):
story = sample['story']
questions = sample['questions']
answers = sample['answers']
additional_answers = sample['additional_answers']
for question_index, question in enumerate(questions):
dataset['story'].append(story)
dataset['question'].append(question['input_text'])
dataset['answer'].append({
'text': answers[question_index]['input_text'],
'answer_start': answers[question_index]['span_start']
})
dataset['id'].append(sample['id'] + '_' + str(question_index))
additional_answers_list = []
for i in range(3):
additional_answers_list.append(additional_answers[str(i)][question_index]['input_text'])
dataset['additional_answers'].append(additional_answers_list)
story = story + ' Q: ' + question['input_text'] + ' A: ' + answers[question_index]['input_text']
if not story[-1] == '.':
story = story + '.'
dataset_df = pd.DataFrame.from_dict(dataset)
dataset = Dataset.from_pandas(dataset_df)
dataset.save_to_disk(save_path)
return save_path
# dataset = datasets.load_from_disk(_save_dataset())
def get_dataset(tokenizer, split='validation'):
# from https://github.com/lorenzkuhn/semantic_uncertainty/blob/main/code/parse_coqa.py
dataset = datasets.load_from_disk(_save_dataset())
id_to_question_mapping = dict(zip(dataset['id'], dataset['question']))
def encode_coqa(example):
example['answer'] = [example['answer']['text']] + example['additional_answers']
example['prompt'] = prompt = example['story'] + ' Q: ' + example['question'] + ' A:'
return tokenizer(prompt, truncation=False, padding=False)
dataset = dataset.map(encode_coqa, batched=False, load_from_cache_file=False)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'], output_all_columns=True)
return dataset
dataset = get_dataset(AutoTokenizer.from_pretrained(MODEL, use_fast=False))  # use the OPT tokenizer, not LlamaTokenizer
else:
raise ValueError("Invalid dataset name")
if args.gene:
model = AutoModelForCausalLM.from_pretrained(MODEL,
torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
begin_index = 0
if args.dataset_name == 'tydiqa':
end_index = len(used_indices)
else:
end_index = len(dataset)
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det_opt/'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det_opt/')
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det_opt/answers'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det_opt/answers')
period_token_id = tokenizer('. ')['input_ids'][1]
eos_tokens = ['Question:', ' Question:', '\n', 'Answer:', ' Answer:', 'Q:']
question_framing_ids = [[tokenizer(eos_token)['input_ids'][1]] for eos_token in eos_tokens]
for i in range(begin_index, end_index):
answers = [None] * args.num_gene
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(
dataset[i]['prompt'], return_tensors='pt').input_ids.cuda()
else:
question = dataset[i]['question']
prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda()
for gen_iter in range(args.num_gene):
if args.most_likely:
generated = model.generate(prompt,
num_beams=5,
num_return_sequences=1,
do_sample=False,
max_new_tokens=64,
eos_token_id=period_token_id,
bad_words_ids=question_framing_ids
)
else:
generated = model.generate(prompt,
do_sample=True,
num_return_sequences=1,
num_beams=1,
max_new_tokens=64,
temperature=0.5,
top_p=1.0,
eos_token_id=period_token_id,
bad_words_ids=question_framing_ids
)
decoded = tokenizer.decode(generated[0, prompt.shape[-1]:],
skip_special_tokens=True)
if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa':
# corner case.
if 'Answer the question concisely' in decoded:
print('#####error')
print(decoded.split('Answer the question concisely')[1])
print('#####error')
decoded = decoded.split('Answer the question concisely')[0]
if args.dataset_name == 'coqa':
if 'Q:' in decoded:
print('#####error')
print(decoded.split('Q:')[1])
print('#####error')
decoded = decoded.split('Q:')[0]
print(decoded)
answers[gen_iter] = decoded
print('sample: ', i)
if args.most_likely:
info = 'most_likely_'
else:
info = 'batch_generations_'
print("Saving answers")
np.save(f'./save_for_eval/{args.dataset_name}_hal_det_opt/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy',
answers)
elif args.generate_gt:
from bleurt_pytorch import BleurtConfig, BleurtForSequenceClassification, BleurtTokenizer
model = BleurtForSequenceClassification.from_pretrained('./models/BLEURT-20').cuda()
tokenizer = BleurtTokenizer.from_pretrained('./models/BLEURT-20')
model.eval()
rouge = evaluate.load('rouge')
gts = np.zeros(0)
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in range(length):
if args.dataset_name == 'tqa':
best_answer = dataset[i]['best_answer']
correct_answer = dataset[i]['correct_answers']
all_answers = [best_answer] + correct_answer
elif args.dataset_name == 'triviaqa':
all_answers = dataset[i]['answer']['aliases']
elif args.dataset_name == 'coqa':
all_answers = dataset[i]['answer']
elif args.dataset_name == 'tydiqa':
all_answers = dataset[int(used_indices[i])]['answers']['text']
if args.most_likely:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det_opt/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
else:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det_opt/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
# get the gt.
if args.use_rouge:
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
all_results1 = np.zeros((len(all_answers), len(predictions)))
all_results2 = np.zeros((len(all_answers), len(predictions)))
for anw in range(len(all_answers)):
results = rouge.compute(predictions=predictions,
references=[all_answers[anw]] * len(predictions),
use_aggregator=False)
all_results[anw] = results['rougeL']
all_results1[anw] = results['rouge1']
all_results2[anw] = results['rouge2']
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 50 == 0:
print("samples passed: ", i)
else:
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
with torch.no_grad():
for anw in range(len(all_answers)):
inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions),
padding='longest', return_tensors='pt')
for key in list(inputs.keys()):
inputs[key] = inputs[key].cuda()
res = np.asarray(model(**inputs).logits.flatten().tolist())
all_results[anw] = res
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 10 == 0:
print("samples passed: ", i)
if args.most_likely:
if args.use_rouge:
np.save(f'./ml_{args.dataset_name}_rouge_score_opt.npy', gts)
else:
np.save(f'./ml_{args.dataset_name}_bleurt_score_opt.npy', gts)
else:
if args.use_rouge:
np.save(f'./bg_{args.dataset_name}_rouge_score_opt.npy', gts)
else:
np.save(f'./bg_{args.dataset_name}_bleurt_score_opt.npy', gts)
else:
model = AutoModelForCausalLM.from_pretrained(MODEL,
torch_dtype=torch.float16).cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
# firstly get the embeddings of the generated question and answers.
embed_generated = []
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in tqdm(range(length)):
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
else:
question = dataset[i]['question']
answers = np.load(
f'save_for_eval/{args.dataset_name}_hal_det_opt/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
if args.dataset_name == 'tydiqa':
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(dataset[i]['prompt'] + anw, return_tensors='pt').input_ids.cuda()
else:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
with torch.no_grad():
hidden_states = model(prompt, output_hidden_states=True).hidden_states
hidden_states = torch.stack(hidden_states, dim=0).squeeze()
hidden_states = hidden_states.detach().cpu().numpy()[:, -1, :]
embed_generated.append(hidden_states)
embed_generated = np.asarray(np.stack(embed_generated), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_gene_embeddings_layer_wise.npy', embed_generated)
HEADS = [f"model.decoder.layers.{i}.self_attn.out_proj" for i in range(model.config.num_hidden_layers)]
MLPS = [f"model.decoder.layers.{i}.fc2" for i in range(model.config.num_hidden_layers)]
embed_generated_loc2 = []
embed_generated_loc1 = []
for i in tqdm(range(length)):
if args.dataset_name == 'tydiqa':
question = dataset[int(used_indices[i])]['question']
else:
question = dataset[i]['question']
answers = np.load(
f'save_for_eval/{args.dataset_name}_hal_det_opt/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
if args.dataset_name == 'tydiqa':
prompt = tokenizer(
"Concisely answer the following question based on the information in the given passage: \n" + \
" Passage: " + dataset[int(used_indices[i])]['context'] + " \n Q: " + question + " \n A:",
return_tensors='pt').input_ids.cuda()
elif args.dataset_name == 'coqa':
prompt = tokenizer(dataset[i]['prompt'] + anw, return_tensors='pt').input_ids.cuda()
else:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
with torch.no_grad():
with TraceDict(model, HEADS + MLPS) as ret:
output = model(prompt, output_hidden_states=True)
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
embed_generated_loc2 = np.asarray(np.stack(embed_generated_loc2), dtype=np.float32)
embed_generated_loc1 = np.asarray(np.stack(embed_generated_loc1), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_gene_embeddings_head_wise.npy', embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_embeddings_mlp_wise.npy', embed_generated_loc2)
# get the split and label (true or false) of the unlabeled data and the test data.
if args.use_rouge:
gts = np.load(f'./ml_{args.dataset_name}_rouge_score_opt.npy')
gts_bg = np.load(f'./bg_{args.dataset_name}_rouge_score_opt.npy')
else:
gts = np.load(f'./ml_{args.dataset_name}_bleurt_score_opt.npy')
gts_bg = np.load(f'./bg_{args.dataset_name}_bleurt_score_opt.npy')
thres = args.thres_gt
gt_label = np.asarray(gts > thres, dtype=np.int32)
gt_label_bg = np.asarray(gts_bg > thres, dtype=np.int32)
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
permuted_index = np.random.permutation(length)
wild_q_indices = permuted_index[:int(args.wild_ratio * length)]
# exclude validation samples.
wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100]
wild_q_indices2 = wild_q_indices[len(wild_q_indices) - 100:]
gt_label_test = []
gt_label_wild = []
gt_label_val = []
for i in range(length):
if i not in wild_q_indices:
gt_label_test.extend(gt_label[i: i+1])
elif i in wild_q_indices1:
gt_label_wild.extend(gt_label[i: i+1])
else:
gt_label_val.extend(gt_label[i: i+1])
gt_label_test = np.asarray(gt_label_test)
gt_label_wild = np.asarray(gt_label_wild)
gt_label_val = np.asarray(gt_label_val)
def svd_embed_score(embed_generated_wild, gt_label, begin_k, k_span, mean=1, svd=1, weight=0):
embed_generated = embed_generated_wild
best_auroc_over_k = 0
best_layer_over_k = 0
best_scores_over_k = None
best_projection_over_k = None
for k in tqdm(range(begin_k, k_span)):
best_auroc = 0
best_layer = 0
best_scores = None
mean_recorded = None
best_projection = None
for layer in range(len(embed_generated_wild[0])):
if mean:
mean_recorded = embed_generated[:, layer, :].mean(0)
centered = embed_generated[:, layer, :] - mean_recorded
else:
centered = embed_generated[:, layer, :]
if not svd:
pca_model = PCA(n_components=k, whiten=False).fit(centered)
projection = pca_model.components_.T
mean_recorded = pca_model.mean_
if weight:
projection = pca_model.singular_values_ * projection
else:
_, sin_value, V_p = torch.linalg.svd(torch.from_numpy(centered).cuda())
projection = V_p[:k, :].T.cpu().data.numpy()
if weight:
projection = sin_value[:k] * projection
# average the coordinates in the top-k subspace, then take the magnitude as the score
scores = np.mean(np.matmul(centered, projection), -1, keepdims=True)
assert scores.shape[1] == 1
scores = np.sqrt(np.sum(np.square(scores), axis=1))
# the projection direction is sign-ambiguous: we do not know in advance whether
# truthful or hallucinated samples receive higher scores, so we evaluate both signs.
# similar practice appears in the representation engineering paper
# https://arxiv.org/abs/2310.01405
measures1 = get_measures(scores[gt_label == 1],
scores[gt_label == 0], plot=False)
measures2 = get_measures(-scores[gt_label == 1],
-scores[gt_label == 0], plot=False)
if measures1[0] > measures2[0]:
measures = measures1
sign_layer = 1
else:
measures = measures2
sign_layer = -1
if measures[0] > best_auroc:
best_auroc = measures[0]
best_result = [100 * measures[2], 100 * measures[0]]
best_layer = layer
best_scores = sign_layer * scores
best_projection = projection
best_mean = mean_recorded
best_sign = sign_layer
print('k: ', k, 'best result: ', best_result, 'layer: ', best_layer,
'mean: ', mean, 'svd: ', svd)
if best_auroc > best_auroc_over_k:
best_auroc_over_k = best_auroc
best_result_over_k = best_result
best_layer_over_k = best_layer
best_k = k
best_sign_over_k = best_sign
best_scores_over_k = best_scores
best_projection_over_k = best_projection
best_mean_over_k = best_mean
return {'k': best_k,
'best_layer':best_layer_over_k,
'best_auroc':best_auroc_over_k,
'best_result':best_result_over_k,
'best_scores':best_scores_over_k,
'best_mean': best_mean_over_k,
'best_sign':best_sign_over_k,
'best_projection':best_projection_over_k}
from sklearn.decomposition import PCA
feat_loc = args.feat_loc_svd
if args.most_likely:
if feat_loc == 3:
embed_generated = np.load(f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_gene_embeddings_layer_wise.npy',
allow_pickle=True)
elif feat_loc == 2:
embed_generated = np.load(
f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_embeddings_mlp_wise.npy',  # match the filename used when saving the mlp-wise embeddings
allow_pickle=True)
else:
embed_generated = np.load(
f'save_for_eval/{args.dataset_name}_hal_det_opt/most_likely_{args.model_name}_gene_embeddings_head_wise.npy',
allow_pickle=True)
feat_indices_wild = []
feat_indices_eval = []
if args.dataset_name == 'tydiqa':
length = len(used_indices)
else:
length = len(dataset)
for i in range(length):
if i in wild_q_indices1:
feat_indices_wild.extend(np.arange(i, i+1).tolist())
elif i in wild_q_indices2:
feat_indices_eval.extend(np.arange(i, i + 1).tolist())
if feat_loc == 3:
embed_generated_wild = embed_generated[feat_indices_wild][:,1:,:]
embed_generated_eval = embed_generated[feat_indices_eval][:, 1:, :]
else:
embed_generated_wild = embed_generated[feat_indices_wild]
embed_generated_eval = embed_generated[feat_indices_eval]
# returned_results = svd_embed_score(embed_generated_wild, gt_label_wild,
# 1, 11, mean=0, svd=0, weight=args.weighted_svd)
# get the best hyper-parameters on validation set
returned_results = svd_embed_score(embed_generated_eval, gt_label_val,
1, 11, mean=0, svd=0, weight=args.weighted_svd)
pca_model = PCA(n_components=returned_results['k'], whiten=False).fit(embed_generated_wild[:,returned_results['best_layer'],:])
projection = pca_model.components_.T
if args.weighted_svd:
projection = pca_model.singular_values_ * projection
scores = np.mean(np.matmul(embed_generated_wild[:,returned_results['best_layer'],:], projection), -1, keepdims=True)
assert scores.shape[1] == 1
best_scores = np.sqrt(np.sum(np.square(scores), axis=1)) * returned_results['best_sign']
# direct projection
feat_indices_test = []
for i in range(length):
if i not in wild_q_indices:
feat_indices_test.extend(np.arange(1 * i, 1 * i + 1).tolist())
if feat_loc == 3:
embed_generated_test = embed_generated[feat_indices_test][:, 1:, :]
else:
embed_generated_test = embed_generated[feat_indices_test]
test_scores = np.mean(np.matmul(embed_generated_test[:,returned_results['best_layer'],:],
projection), -1, keepdims=True)
assert test_scores.shape[1] == 1
test_scores = np.sqrt(np.sum(np.square(test_scores), axis=1))
measures = get_measures(returned_results['best_sign'] * test_scores[gt_label_test == 1],
returned_results['best_sign'] * test_scores[gt_label_test == 0], plot=False)
print_measures(measures[0], measures[1], measures[2], 'direct-projection')
thresholds = np.linspace(0,1, num=40)[1:-1]
normalizer = lambda x: x / (np.linalg.norm(x, ord=2, axis=-1, keepdims=True) + 1e-10)
auroc_over_thres = []
for thres_wild in thresholds:
best_auroc = 0
for layer in range(len(embed_generated_wild[0])):
thres_wild_score = np.sort(best_scores)[int(len(best_scores) * thres_wild)]
true_wild = embed_generated_wild[:,layer,:][best_scores > thres_wild_score]
false_wild = embed_generated_wild[:,layer,:][best_scores <= thres_wild_score]
embed_train = np.concatenate([true_wild,false_wild],0)
label_train = np.concatenate([np.ones(len(true_wild)),
np.zeros(len(false_wild))], 0)
## gt training, saplma
# embed_train = embed_generated_wild[:,layer,:]
# label_train = gt_label_wild
## gt training, saplma
from linear_probe import get_linear_acc
best_acc, final_acc, (
clf, best_state, best_preds, preds, labels_val), losses_train = get_linear_acc(
embed_train,
label_train,
embed_train,
label_train,
2, epochs = 50,
print_ret = True,
batch_size=512,
cosine=True,
nonlinear = True,
learning_rate = 0.05,
weight_decay = 0.0003)
clf.eval()
output = clf(torch.from_numpy(
embed_generated_test[:, layer, :]).cuda())
pca_wild_score_binary_cls = torch.sigmoid(output)
pca_wild_score_binary_cls = pca_wild_score_binary_cls.cpu().data.numpy()
if np.isnan(pca_wild_score_binary_cls).sum() > 0:
raise ValueError('NaN encountered in probe scores')
measures = get_measures(pca_wild_score_binary_cls[gt_label_test == 1],
pca_wild_score_binary_cls[gt_label_test == 0], plot=False)
if measures[0] > best_auroc:
best_auroc = measures[0]
best_result = [100 * measures[0]]
best_layer = layer
auroc_over_thres.append(best_auroc)
print('thres: ', thres_wild, 'best result: ', best_result, 'best_layer: ', best_layer)
if __name__ == '__main__':
seed_everything(42)
main()
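The direct-projection scoring used above can be sketched in a few lines of NumPy. This is an illustrative stand-in only: `projection_score` is a hypothetical helper (the repo fits sklearn's `PCA` per layer and picks the best layer, sign, and `k` on a validation split), and the SVD here plays the role of that PCA fit.

```python
import numpy as np

def projection_score(embed_wild, embed_test, k=2, weighted=False):
    # Fit the top-k principal directions on the unlabeled "wild" embeddings
    # of one layer, then score samples by the magnitude of their projection,
    # mirroring the direct-projection step in the script above.
    mean = embed_wild.mean(axis=0, keepdims=True)
    _, sing, vt = np.linalg.svd(embed_wild - mean, full_matrices=False)
    proj = vt[:k].T                 # (d, k) top-k principal directions
    if weighted:
        proj = proj * sing[:k]      # weight each direction by its singular value
    scores = np.mean(embed_test @ proj, axis=-1, keepdims=True)
    return np.sqrt(np.sum(scores ** 2, axis=1))   # per-sample score magnitude

rng = np.random.default_rng(0)
wild = rng.normal(size=(100, 16))   # 100 unlabeled samples, 16-dim features
test = rng.normal(size=(10, 16))
print(projection_score(wild, test).shape)   # -> (10,)
```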

linear_probe.py Normal file

@@ -0,0 +1,314 @@
from __future__ import print_function
import os
import sys
import argparse
import time
import math
import easydict
import torch
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
from ylib.ytool import ArrayDataset
cudnn.benchmark = True
class LinearClassifier(nn.Module):
"""Linear classifier"""
def __init__(self, feat_dim, num_classes=10):
super(LinearClassifier, self).__init__()
self.fc = nn.Linear(feat_dim, 1)  # single logit for binary classification; num_classes kept for interface compatibility
def forward(self, features):
return self.fc(features)
class NonLinearClassifier(nn.Module):
"""Linear classifier"""
def __init__(self, feat_dim, num_classes=10):
super(NonLinearClassifier, self).__init__()
self.fc1 = nn.Linear(feat_dim, 1024)
# self.fc2 = nn.Linear(1024, 512)
self.fc3 = nn.Linear(1024, 1)
def forward(self, features):
x = F.relu(self.fc1(features))
# x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
class NormedLinear(nn.Module):
def __init__(self, in_features, out_features, bn=False):
super(NormedLinear, self).__init__()
self.weight = nn.Parameter(torch.Tensor(in_features, out_features))
self.weight.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5)
self.bn = bn
if bn:
self.bn_layer = nn.BatchNorm1d(out_features)
def forward(self, x):
out = F.normalize(x, dim=1).mm(F.normalize(self.weight, dim=0))
if self.bn:
out = self.bn_layer(out)
return out
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].flatten().float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
def adjust_learning_rate(args, optimizer, epoch):
lr = args.learning_rate
if args.cosine:
eta_min = lr * (args.lr_decay_rate ** 3)
lr = eta_min + (lr - eta_min) * (
1 + math.cos(math.pi * epoch / args.epochs)) / 2
else:
steps = np.sum(epoch > np.asarray(args.lr_decay_epochs))
if steps > 0:
lr = lr * (args.lr_decay_rate ** steps)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def warmup_learning_rate(args, epoch, batch_id, total_batches, optimizer):
if args.warm and epoch <= args.warm_epochs:
p = (batch_id + (epoch - 1) * total_batches) / \
(args.warm_epochs * total_batches)
lr = args.warmup_from + p * (args.warmup_to - args.warmup_from)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def set_optimizer(opt, model):
optimizer = optim.SGD(model.parameters(),
lr=opt.learning_rate,
momentum=opt.momentum,
weight_decay=opt.weight_decay)
return optimizer
try:
import apex
from apex import amp, optimizers
except ImportError:
pass
def train(train_loader, classifier, criterion, optimizer, epoch, print_freq=10):
"""one epoch training"""
classifier.train()
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
end = time.time()
for idx, (features, labels) in enumerate(train_loader):
data_time.update(time.time() - end)
features = features.cuda(non_blocking=True).float()
labels = labels.cuda(non_blocking=True).long()
bsz = labels.shape[0]
optimizer.zero_grad()
# warm-up learning rate
# warmup_learning_rate(opt, epoch, idx, len(train_loader), optimizer)
output = classifier(features)
loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.float())
# update metric
losses.update(loss.item(), bsz)
# acc1, acc5 = accuracy(output, labels, topk=(1, 5))
# breakpoint()
correct = (torch.sigmoid(output) > 0.5).long().view(-1).eq(labels.view(-1))
top1.update(correct.sum() / bsz, bsz)
# SGD
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
#
# # print info
if (idx + 1) % print_freq == 0:
print('Train: [{0}][{1}/{2}]\t'
'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'DT {data_time.val:.3f} ({data_time.avg:.3f})\t'
'loss {loss.val:.3f} ({loss.avg:.3f})\t'
'Acc@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
epoch, idx + 1, len(train_loader), batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1))
sys.stdout.flush()
return losses.avg, top1.avg
def validate(val_loader, classifier, criterion, print_freq):
"""validation"""
classifier.eval()
batch_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
preds = np.array([])
labels_out = np.array([])
with torch.no_grad():
end = time.time()
for idx, (features, labels) in enumerate(val_loader):
features = features.float().cuda()
labels_out = np.append(labels_out, labels)
labels = labels.long().cuda()
bsz = labels.shape[0]
# forward
# output = classifier(model.encoder(images))
output = classifier(features.detach())
loss = F.binary_cross_entropy_with_logits(output.view(-1), labels.float())
prob = torch.sigmoid(output)
conf = prob
pred = (prob>0.5).long().view(-1)
# conf, pred = prob.max(1)
preds = np.append(preds, conf.cpu().numpy())
# update metric
losses.update(loss.item(), bsz)
correct = (torch.sigmoid(output) > 0.5).long().view(-1).eq(labels.view(-1))
top1.update(correct.sum()/bsz, bsz)
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if (idx + 1) % 200 == 0:
print('Test: [{0}/{1}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Acc@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
idx, len(val_loader), batch_time=batch_time,
loss=losses, top1=top1))
# print(' * Acc@1 {top1.avg:.3f}'.format(top1=top1))
return losses.avg, top1.avg, preds, labels_out
def get_linear_acc(ftrain, ltrain, ftest, ltest, n_cls, epochs=10,
args=None, classifier=None,
print_ret=True, normed=False, nonlinear=False,
learning_rate=5,
weight_decay=0,
batch_size=512,
cosine=False,
lr_decay_epochs=[30,60,90]):
cluster2label = np.unique(ltrain)
label2cluster = {li: ci for ci, li in enumerate(cluster2label)}
ctrain = [label2cluster[l] for l in ltrain]
ctest = [label2cluster[l] for l in ltest]
# breakpoint()
opt = easydict.EasyDict({
"lr_decay_rate": 0.2,
"cosine": cosine,
"lr_decay_epochs": lr_decay_epochs,
"start_epoch": 0,
"learning_rate": learning_rate,
"epochs": epochs,
"print_freq": 200,
"batch_size": batch_size,
"momentum": 0.9,
"weight_decay": weight_decay,
})
if args is not None:
for k, v in args.items():
opt[k] = v
best_acc = 0
criterion = torch.nn.CrossEntropyLoss().cuda()  # defined for interface compatibility; train/validate above use BCE-with-logits directly
if classifier is None:
classifier = LinearClassifier(ftrain.shape[1], num_classes=n_cls).cuda()
if nonlinear:
classifier = NonLinearClassifier(ftrain.shape[1], num_classes=n_cls).cuda()
trainset = ArrayDataset(ftrain, labels=ctrain)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=opt.batch_size, shuffle=True)
valset = ArrayDataset(ftest, labels=ctest)
val_loader = torch.utils.data.DataLoader(valset, batch_size=opt.batch_size, shuffle=False)
optimizer = set_optimizer(opt, classifier)
best_preds = None
best_state = None
# training routine
for epoch in range(opt.start_epoch + 1, opt.epochs + 1):
adjust_learning_rate(opt, optimizer, epoch)
# train for one epoch
loss_train, acc = train(train_loader, classifier, criterion, optimizer, epoch, print_freq=opt.print_freq)
# eval for one epoch
loss, val_acc, preds, labels_out = validate(val_loader, classifier, criterion, print_freq=opt.print_freq)
if val_acc > best_acc:
best_acc = val_acc
best_preds = preds
best_state = copy.deepcopy(classifier.state_dict())
return best_acc.item(), val_acc.item(), (classifier, best_state, best_preds, preds, labels_out), loss_train
def save_model(model, acc, save_file):
print('==> Saving...')
torch.save({
'acc': acc,
'state_dict': model.state_dict(),
}, save_file)
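At its core, the probe training in `get_linear_acc` is binary logistic regression on features with (pseudo-)labels. A minimal NumPy sketch of that idea, for intuition only (`fit_binary_probe` is not part of this repo, which trains the PyTorch classifiers above with SGD):

```python
import numpy as np

def fit_binary_probe(feats, labels, lr=0.1, epochs=200):
    # Plain logistic regression by full-batch gradient descent; a stand-in
    # for the PyTorch LinearClassifier trained with BCE-with-logits above.
    w = np.zeros(feats.shape[1])
    b = 0.0
    for _ in range(epochs):
        p = 1.0 / (1.0 + np.exp(-(feats @ w + b)))   # sigmoid probabilities
        grad = p - labels                            # dBCE/dlogit
        w -= lr * feats.T @ grad / len(feats)
        b -= lr * grad.mean()
    return w, b

rng = np.random.default_rng(0)
feats = rng.normal(size=(200, 8))
labels = (feats[:, 0] > 0).astype(float)   # pseudo-labels from a score threshold
w, b = fit_binary_probe(feats, labels)
preds = (feats @ w + b) > 0
print((preds == labels).mean())            # training accuracy of the probe
```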

llama_iti/__init__.py Normal file

@@ -0,0 +1,90 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from transformers.utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_llama"] = ["LlamaTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_llama"] = [
"LlamaForCausalLM",
"LlamaModel",
"LlamaPreTrainedModel",
"LlamaForSequenceClassification",
]
if TYPE_CHECKING:
from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_llama import LlamaTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_llama_fast import LlamaTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)


@@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
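The validation logic can be exercised standalone. The following mirror of `_rope_scaling_validation` is a hypothetical free function (not part of transformers) showing which `rope_scaling` dictionaries pass:

```python
def validate_rope_scaling(rope_scaling):
    # Standalone mirror of LlamaConfig._rope_scaling_validation above.
    if rope_scaling is None:
        return  # no scaling requested; nothing to check
    if not isinstance(rope_scaling, dict) or len(rope_scaling) != 2:
        raise ValueError(f"`rope_scaling` must be a dict with fields `type` and `factor`, got {rope_scaling}")
    if rope_scaling.get("type") not in ("linear", "dynamic"):
        raise ValueError(f"`rope_scaling`'s type must be 'linear' or 'dynamic', got {rope_scaling.get('type')}")
    factor = rope_scaling.get("factor")
    if not isinstance(factor, float) or factor <= 1.0:
        raise ValueError(f"`rope_scaling`'s factor must be a float > 1, got {factor}")

validate_rope_scaling({"type": "linear", "factor": 2.0})    # accepted
try:
    validate_rope_scaling({"type": "linear", "factor": 2})  # int, not float: rejected
except ValueError as err:
    print("rejected:", err)
```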


@@ -0,0 +1,318 @@
# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import os
import shutil
import warnings
import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
try:
from transformers import LlamaTokenizerFast
except ImportError as e:
warnings.warn(str(e))
warnings.warn(
"The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
)
LlamaTokenizerFast = None
"""
Sample usage:
```
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```py
from transformers import LlamaForCausalLM, LlamaTokenizer
model = LlamaForCausalLM.from_pretrained("/output/path")
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
"""
NUM_SHARDS = {
"7B": 1,
"7Bf": 1,
"13B": 2,
"13Bf": 2,
"34B": 4,
"30B": 4,
"65B": 8,
"70B": 8,
"70Bf": 8,
}
def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True):
# for backward compatibility, before you needed the repo to be called `my_repo/model_size`
if not os.path.isfile(os.path.join(input_base_path, "params.json")):
input_base_path = os.path.join(input_base_path, model_size)
os.makedirs(model_path, exist_ok=True)
tmp_model_path = os.path.join(model_path, "tmp")
os.makedirs(tmp_model_path, exist_ok=True)
params = read_json(os.path.join(input_base_path, "params.json"))
num_shards = NUM_SHARDS[model_size]
n_layers = params["n_layers"]
n_heads = params["n_heads"]
n_heads_per_shard = n_heads // num_shards
dim = params["dim"]
dims_per_head = dim // n_heads
base = params.get("rope_theta", 10000.0)
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
if base > 10000.0:
max_position_embeddings = 16384
else:
max_position_embeddings = 2048
tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
if tokenizer_path is not None:
tokenizer = tokenizer_class(tokenizer_path)
tokenizer.save_pretrained(model_path)
vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000
if "n_kv_heads" in params:
num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
key_value_dim = dim // num_key_value_heads
else: # compatibility with other checkpoints
num_key_value_heads = n_heads
num_local_key_value_heads = n_heads_per_shard
key_value_dim = dim
# permute for sliced rotary
def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
# Load weights
if num_shards == 1:
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
else:
# Sharded
loaded = [
torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
for i in range(num_shards)
]
param_count = 0
index_dict = {"weight_map": {}}
for layer_i in range(n_layers):
filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
if num_shards == 1:
# Unsharded
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
loaded[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
}
else:
# Sharded
# Note that attention.w{q,k,v,o}, feed_forward.w[1,2,3], attention_norm.weight and ffn_norm.weight share
# the same storage object, saving attention_norm and ffn_norm will save other weights too, which is
# redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned.
state_dict = {
f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
f"layers.{layer_i}.attention_norm.weight"
].clone(),
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
f"layers.{layer_i}.ffn_norm.weight"
].clone(),
}
state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
for i in range(num_shards)
],
dim=0,
).reshape(dim, dim)
)
state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(
num_local_key_value_heads, dims_per_head, dim
)
for i in range(num_shards)
],
dim=0,
).reshape(key_value_dim, dim),
num_key_value_heads,
key_value_dim,
dim,
)
state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
[
loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(
num_local_key_value_heads, dims_per_head, dim
)
for i in range(num_shards)
],
dim=0,
).reshape(key_value_dim, dim)
state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
)
state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
[loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
if num_shards == 1:
# Unsharded
state_dict = {
"model.embed_tokens.weight": loaded["tok_embeddings.weight"],
"model.norm.weight": loaded["norm.weight"],
"lm_head.weight": loaded["output.weight"],
}
else:
state_dict = {
"model.norm.weight": loaded[0]["norm.weight"],
"model.embed_tokens.weight": torch.cat(
[loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
),
"lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
}
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
# Write configs
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
multiple_of = params["multiple_of"] if "multiple_of" in params else 256
config = LlamaConfig(
hidden_size=dim,
intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
num_attention_heads=params["n_heads"],
num_hidden_layers=params["n_layers"],
rms_norm_eps=params["norm_eps"],
num_key_value_heads=num_key_value_heads,
vocab_size=vocab_size,
rope_theta=base,
max_position_embeddings=max_position_embeddings,
)
config.save_pretrained(tmp_model_path)
# Make space so we can load the model properly now.
del state_dict
del loaded
gc.collect()
print("Loading the checkpoint in a Llama model.")
model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
# Avoid saving this as part of the config.
del model.config._name_or_path
model.config.torch_dtype = torch.float16
print("Saving in the Transformers format.")
model.save_pretrained(model_path, safe_serialization=safe_serialization)
shutil.rmtree(tmp_model_path)
def write_tokenizer(tokenizer_path, input_tokenizer_path):
# Initialize the tokenizer based on the `spm` model
tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
tokenizer = tokenizer_class(input_tokenizer_path)
tokenizer.save_pretrained(tokenizer_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of LLaMA weights, which contains tokenizer.model and model folders",
)
parser.add_argument(
"--model_size",
choices=["7B", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"],
help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, check out the original repo: https://huggingface.co/meta-llama",
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
args = parser.parse_args()
spm_path = os.path.join(args.input_dir, "tokenizer.model")
if args.model_size != "tokenizer_only":
write_model(
model_path=args.output_dir,
input_base_path=args.input_dir,
model_size=args.model_size,
safe_serialization=args.safe_serialization,
tokenizer_path=spm_path,
)
else:
write_tokenizer(args.output_dir, spm_path)
if __name__ == "__main__":
main()
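The `permute` helper above is the subtle part of the conversion: it regroups each attention head's rows from the interleaved (pairwise) rotary layout of the Meta checkpoints into the split-halves layout the HF rotary embedding expects. A NumPy sketch, with a hypothetical `unpermute` inverse for sanity-checking a conversion:

```python
import numpy as np

def permute(w, n_heads, dim1, dim2):
    # NumPy mirror of the torch `permute` above:
    # (n_heads, pairs, 2, dim2) -> swap the pair axes -> flatten back.
    return (w.reshape(n_heads, dim1 // n_heads // 2, 2, dim2)
             .transpose(0, 2, 1, 3)
             .reshape(dim1, dim2))

def unpermute(w, n_heads, dim1, dim2):
    # Inverse transform: undoes the axis swap, recovering the original layout.
    return (w.reshape(n_heads, 2, dim1 // n_heads // 2, dim2)
             .transpose(0, 2, 1, 3)
             .reshape(dim1, dim2))

w = np.arange(8 * 4).reshape(8, 4)       # toy weight: dim1=8 (2 heads), dim2=4
out = permute(w, n_heads=2, dim1=8, dim2=4)
print(np.array_equal(unpermute(out, 2, 8, 4), w))   # round-trips exactly: True
```

Because the transform is a pure reshape/transpose, it is lossless; only the row order within each head changes.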

llama_iti/modeling_llama.py Normal file

File diff suppressed because it is too large


@@ -0,0 +1,472 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.convert_slow_tokenizer import import_protobuf
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
if TYPE_CHECKING:
from transformers.tokenization_utils_base import TextInput
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
class LlamaTokenizer(PreTrainedTokenizer):
"""
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
no padding token in the original model.
Args:
vocab_file (`str`):
Path to the vocabulary file.
unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
The end of sequence token.
pad_token (`str` or `tokenizers.AddedToken`, *optional*):
A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
attention mechanisms or loss computation.
sp_model_kwargs (`Dict[str, Any]`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
add_bos_token (`bool`, *optional*, defaults to `True`):
Whether or not to add a `bos_token` at the start of sequences.
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
extra spaces.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to add spaces between special tokens.
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
use_default_system_prompt=False,
spaces_between_special_tokens=False,
legacy=None,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. This is"
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565"
)
legacy = True
self.legacy = legacy
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.use_default_system_prompt = use_default_system_prompt
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
use_default_system_prompt=use_default_system_prompt,
spaces_between_special_tokens=spaces_between_special_tokens,
legacy=legacy,
**kwargs,
)
@property
def unk_token_length(self):
return len(self.sp_model.encode(str(self.unk_token)))
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
def get_spm_processor(self, from_slow=False):
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy or from_slow: # no dependency on protobuf
tokenizer.Load(self.vocab_file)
return tokenizer
with open(self.vocab_file, "rb") as f:
sp_model = f.read()
model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
model = model_pb2.ModelProto.FromString(sp_model)
normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec.add_dummy_prefix = False
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
"""
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
first token is special.
"""
if self.legacy or len(text) == 0:
return super().tokenize(text, **kwargs)
tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
# 1. Encode string + prefix ex: "<unk> Hey"
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# since we manually add the prefix space, we have to remove it when decoding
if tokens[0].startswith(SPIECE_UNDERLINE):
tokens[0] = tokens[0][1:]
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0 and self.legacy:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple[str]`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
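The wrapping logic in `build_inputs_with_special_tokens` is easy to verify with plain lists; here is a standalone sketch using placeholder ids (`1` for `<s>`, `2` for `</s>`, chosen only for illustration):

```python
# Standalone re-implementation of the BOS/EOS wrapping above, with
# placeholder special-token ids (1 = <s>, 2 = </s>) for illustration.
def build_inputs(ids0, ids1=None, add_bos=True, add_eos=False,
                 bos_id=1, eos_id=2):
    bos = [bos_id] if add_bos else []
    eos = [eos_id] if add_eos else []
    out = bos + ids0 + eos
    if ids1 is not None:
        # a sequence pair gets a second BOS/EOS wrapper
        out += bos + ids1 + eos
    return out

assert build_inputs([10, 11]) == [1, 10, 11]
assert build_inputs([10], [20], add_eos=True) == [1, 10, 2, 1, 20, 2]
```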
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (
bos_token_id
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
@property
def default_chat_template(self):
"""
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
to fine-tune a model with more flexible role ordering!
The output should look something like:
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
"{% set system_message = messages[0]['content'] %}"
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
"{% else %}"
"{% set loop_messages = messages %}"
"{% set system_message = false %}"
"{% endif %}"
"{% for message in loop_messages %}" # Loop over all non-system messages
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
"{% endif %}"
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
"{% else %}"
"{% set content = message['content'] %}"
"{% endif %}"
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
"{% elif message['role'] == 'system' %}"
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
"{% elif message['role'] == 'assistant' %}"
"{{ ' ' + content.strip() + ' ' + eos_token }}"
"{% endif %}"
"{% endfor %}"
)
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
return template
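For intuition, the string the default template produces for a first user turn with a system message can be reproduced by hand from the module constants (the BOS token is omitted here for brevity):

```python
# Hand-rolled sketch of the Llama-2 chat format the template above
# emits for the first turn, using the same module-level constants.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def format_first_turn(system: str, user: str) -> str:
    # content = system prompt wrapped in <<SYS>> markers + user message;
    # the template then emits "[INST] " + content + " [/INST]".
    content = f"{B_SYS}{system}{E_SYS}{user}"
    return f"{B_INST} {content.strip()} {E_INST}"

prompt = format_first_turn("Be concise.", "What is 2+2?")
assert prompt == "[INST] <<SYS>>\nBe concise.\n<</SYS>>\n\nWhat is 2+2? [/INST]"
```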

@@ -0,0 +1,279 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from shutil import copyfile
from typing import Optional, Tuple
from tokenizers import processors
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import is_sentencepiece_available, logging
from transformers.utils.versions import require_version
require_version("tokenizers>=0.13.3")
if is_sentencepiece_available():
from .tokenization_llama import LlamaTokenizer
else:
LlamaTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
class LlamaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
This uses notably ByteFallback and no normalization.
```python
>>> from transformers import LlamaTokenizerFast
>>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, check out
the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`, *optional*):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`, *optional*):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
extra spaces.
unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
The end of sequence token.
add_bos_token (`bool`, *optional*, defaults to `True`):
Whether or not to add a `bos_token` at the start of sequences.
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = LlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self._add_bos_token = add_bos_token
self._add_eos_token = add_eos_token
self.update_post_processor()
self.use_default_system_prompt = use_default_system_prompt
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
"""
bos = self.bos_token
bos_token_id = self.bos_token_id
if bos is None and self.add_bos_token:
raise ValueError("add_bos_token = True but bos_token = None")
eos = self.eos_token
eos_token_id = self.eos_token_id
if eos is None and self.add_eos_token:
raise ValueError("add_eos_token = True but eos_token = None")
single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
special_tokens = []
if self.add_bos_token:
special_tokens.append((bos, bos_token_id))
if self.add_eos_token:
special_tokens.append((eos, eos_token_id))
self._tokenizer.post_processor = processors.TemplateProcessing(
single=single, pair=pair, special_tokens=special_tokens
)
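With the defaults (`add_bos_token=True`, `add_eos_token=False`), the `single` and `pair` template strings assembled above evaluate to plain `tokenizers` TemplateProcessing specs; a quick standalone check of the string-building logic:

```python
# Reproduce the single/pair template strings built in
# update_post_processor, under the default flag settings.
bos, eos = "<s>", "</s>"
add_bos_token, add_eos_token = True, False

single = f"{(bos + ':0 ') if add_bos_token else ''}$A:0{(' ' + eos + ':0') if add_eos_token else ''}"
pair = f"{single}{(' ' + bos + ':1') if add_bos_token else ''} $B:1{(' ' + eos + ':1') if add_eos_token else ''}"

# $A / $B are the sequence placeholders; :0 / :1 are the type ids.
assert single == "<s>:0 $A:0"
assert pair == "<s>:0 $A:0 <s>:1 $B:1"
```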
@property
def add_eos_token(self):
return self._add_eos_token
@property
def add_bos_token(self):
return self._add_bos_token
@add_eos_token.setter
def add_eos_token(self, value):
self._add_eos_token = value
self.update_post_processor()
@add_bos_token.setter
def add_bos_token(self, value):
self._add_bos_token = value
self.update_post_processor()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
@property
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
def default_chat_template(self):
"""
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
to fine-tune a model with more flexible role ordering!
The output should look something like:
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
"{% set system_message = messages[0]['content'] %}"
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
"{% else %}"
"{% set loop_messages = messages %}"
"{% set system_message = false %}"
"{% endif %}"
"{% for message in loop_messages %}" # Loop over all non-system messages
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
"{% endif %}"
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
"{% else %}"
"{% set content = message['content'] %}"
"{% endif %}"
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
"{% elif message['role'] == 'system' %}"
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
"{% elif message['role'] == 'assistant' %}"
"{{ ' ' + content.strip() + ' ' + eos_token }}"
"{% endif %}"
"{% endfor %}"
)
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
return template
# TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output

metric_utils.py (new file, 218 lines)

@@ -0,0 +1,218 @@
import numpy as np
import sklearn.metrics as sk
recall_level_default = 0.95
def stable_cumsum(arr, rtol=1e-05, atol=1e-08):
"""Use high precision for cumsum and check that final value matches sum
Parameters
----------
arr : array-like
To be cumulatively summed as flat
rtol : float
Relative tolerance, see ``np.allclose``
atol : float
Absolute tolerance, see ``np.allclose``
"""
out = np.cumsum(arr, dtype=np.float64)
expected = np.sum(arr, dtype=np.float64)
if not np.allclose(out[-1], expected, rtol=rtol, atol=atol):
raise RuntimeError('cumsum was found to be unstable: '
'its last element does not correspond to sum')
return out
def fpr_and_fdr_at_recall(y_true, y_score, recall_level=recall_level_default,
pos_label=None, return_index=False):
classes = np.unique(y_true)
if (pos_label is None and
not (np.array_equal(classes, [0, 1]) or
np.array_equal(classes, [-1, 1]) or
np.array_equal(classes, [0]) or
np.array_equal(classes, [-1]) or
np.array_equal(classes, [1]))):
raise ValueError("Data is not binary and pos_label is not specified")
elif pos_label is None:
pos_label = 1.
# make y_true a boolean vector
y_true = (y_true == pos_label)
# sort scores and corresponding truth values
desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
y_score = y_score[desc_score_indices]
y_true = y_true[desc_score_indices]
# y_score typically has many tied values. Here we extract
# the indices associated with the distinct values. We also
# concatenate a value for the end of the curve.
distinct_value_indices = np.where(np.diff(y_score))[0]
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
# accumulate the true positives with decreasing threshold
tps = stable_cumsum(y_true)[threshold_idxs]
fps = 1 + threshold_idxs - tps # add one because of zero-based indexing
thresholds = y_score[threshold_idxs]
recall = tps / tps[-1]
recall_fps = fps / fps[-1]
    # extra bookkeeping: also return the sample indices between the FPR and TPR cutoffs
if return_index:
recall_level_fps = 1 - recall_level_default
index_for_tps = threshold_idxs[np.argmin(np.abs(recall - recall_level))]
index_for_fps = threshold_idxs[np.argmin(np.abs(recall_fps - recall_level_fps))]
index_for_id_initial = []
index_for_ood_initial = []
for index in range(index_for_fps, index_for_tps + 1):
if y_true[index] == 1:
index_for_id_initial.append(desc_score_indices[index])
else:
index_for_ood_initial.append(desc_score_indices[index])
last_ind = tps.searchsorted(tps[-1])
sl = slice(last_ind, None, -1) # [last_ind::-1]
recall, fps, tps, thresholds = np.r_[recall[sl], 1], np.r_[fps[sl], 0], np.r_[tps[sl], 0], thresholds[sl]
cutoff = np.argmin(np.abs(recall - recall_level))
if return_index:
return fps[cutoff] / (np.sum(np.logical_not(y_true))), index_for_id_initial, index_for_ood_initial
else:
return fps[cutoff] / (np.sum(np.logical_not(y_true)))
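A minimal sketch of the "FPR at a target recall" idea computed with scikit-learn directly; `fpr_at_recall` is an illustrative helper, not part of this module, and the data is made up:

```python
import numpy as np
from sklearn.metrics import roc_curve

def fpr_at_recall(labels, scores, recall_level=0.95):
    # false positive rate at the ROC point closest to the target TPR (recall)
    fpr, tpr, _ = roc_curve(labels, scores)
    return fpr[np.argmin(np.abs(tpr - recall_level))]

labels = np.array([1, 1, 1, 1, 0, 0, 0, 0])
scores = np.array([.9, .8, .7, .6, .4, .3, .2, .1])  # perfectly separable
result = fpr_at_recall(labels, scores)
```

With perfectly separable scores, the FPR at 95% recall is 0.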
def get_measures(_pos, _neg, recall_level=recall_level_default, return_index=False, plot=False):
pos = np.array(_pos[:]).reshape((-1, 1))
neg = np.array(_neg[:]).reshape((-1, 1))
examples = np.squeeze(np.vstack((pos, neg)))
labels = np.zeros(len(examples), dtype=np.int32)
labels[:len(pos)] += 1
auroc = sk.roc_auc_score(labels, examples)
if plot:
import matplotlib.pyplot as plt
fpr1, tpr1, thresholds = sk.roc_curve(labels, examples, pos_label=1)
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr1, tpr1, linewidth=2,
label='10000_1')
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.legend(fontsize=12)
plt.savefig('10000_1.jpg', dpi=250)
aupr = sk.average_precision_score(labels, examples)
if return_index:
fpr, index_id, index_ood = fpr_and_fdr_at_recall(labels, examples, recall_level, return_index=return_index)
return auroc, aupr, fpr, index_id, index_ood
else:
fpr = fpr_and_fdr_at_recall(labels, examples, recall_level)
return auroc, aupr, fpr
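The AUROC/AUPR part of `get_measures` can be sketched standalone on synthetic scores (the score distributions here are made up for illustration):

```python
import numpy as np
import sklearn.metrics as sk

rng = np.random.default_rng(0)
pos = rng.normal(2.0, 1.0, 200)  # scores for the positive class (e.g. hallucinated)
neg = rng.normal(0.0, 1.0, 200)  # scores for the negative class (e.g. truthful)

examples = np.concatenate([pos, neg])
labels = np.concatenate([np.ones(200, dtype=int), np.zeros(200, dtype=int)])

auroc = sk.roc_auc_score(labels, examples)
aupr = sk.average_precision_score(labels, examples)
```

With a two-sigma mean separation, AUROC lands around 0.92.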
def get_measures_entangled(_pos, _neg, _pos1, _neg1,
recall_level=recall_level_default, return_index=False, plot=False):
pos = np.array(_pos[:]).reshape((-1, 1))
neg = np.array(_neg[:]).reshape((-1, 1))
examples = np.squeeze(np.vstack((pos, neg)))
labels = np.zeros(len(examples), dtype=np.int32)
labels[:len(pos)] += 1
pos1 = np.array(_pos1[:]).reshape((-1, 1))
neg1 = np.array(_neg1[:]).reshape((-1, 1))
examples1 = np.squeeze(np.vstack((pos1, neg1)))
labels1 = np.zeros(len(examples1), dtype=np.int32)
labels1[:len(pos1)] += 1
auroc = sk.roc_auc_score(labels, examples)
if plot:
import matplotlib.pyplot as plt
fpr1, tpr1, thresholds = sk.roc_curve(labels, examples, pos_label=1)
fpr2, tpr2, thresholds1 = sk.roc_curve(labels1, examples1, pos_label=1)
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr1, tpr1, linewidth=2,
label='One layer')
ax.plot(fpr2, tpr2, linewidth=2,
label='Two layer')
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.legend(fontsize=12)
plt.savefig('one_layer.jpg', dpi=250)
aupr = sk.average_precision_score(labels, examples)
if return_index:
fpr, index_id, index_ood = fpr_and_fdr_at_recall(labels, examples, recall_level, return_index=return_index)
return auroc, aupr, fpr, index_id, index_ood
else:
fpr = fpr_and_fdr_at_recall(labels, examples, recall_level)
return auroc, aupr, fpr
def show_performance(pos, neg, method_name='Ours', recall_level=recall_level_default):
'''
:param pos: 1's class, class to detect, outliers, or wrongly predicted
example scores
:param neg: 0's class scores
'''
auroc, aupr, fpr = get_measures(pos[:], neg[:], recall_level)
print('\t\t\t' + method_name)
print('FPR{:d}:\t\t\t{:.2f}'.format(int(100 * recall_level), 100 * fpr))
print('AUROC:\t\t\t{:.2f}'.format(100 * auroc))
print('AUPR:\t\t\t{:.2f}'.format(100 * aupr))
def print_measures(auroc, aupr, fpr, method_name='Ours', recall_level=recall_level_default):
print('\t\t\t\t' + method_name)
print(' FPR{:d} AUROC AUPR'.format(int(100*recall_level)))
print('& {:.2f} & {:.2f} & {:.2f}'.format(100*fpr, 100*auroc, 100*aupr))
def print_measures_with_std(aurocs, auprs, fprs, method_name='Ours', recall_level=recall_level_default):
print('\t\t\t\t' + method_name)
print(' FPR{:d} AUROC AUPR'.format(int(100*recall_level)))
print('& {:.2f} & {:.2f} & {:.2f}'.format(100*np.mean(fprs), 100*np.mean(aurocs), 100*np.mean(auprs)))
print('& {:.2f} & {:.2f} & {:.2f}'.format(100*np.std(fprs), 100*np.std(aurocs), 100*np.std(auprs)))
def show_performance_comparison(pos_base, neg_base, pos_ours, neg_ours, baseline_name='Baseline',
method_name='Ours', recall_level=recall_level_default):
'''
:param pos_base: 1's class, class to detect, outliers, or wrongly predicted
example scores from the baseline
:param neg_base: 0's class scores generated by the baseline
'''
auroc_base, aupr_base, fpr_base = get_measures(pos_base[:], neg_base[:], recall_level)
auroc_ours, aupr_ours, fpr_ours = get_measures(pos_ours[:], neg_ours[:], recall_level)
print('\t\t\t' + baseline_name + '\t' + method_name)
print('FPR{:d}:\t\t\t{:.2f}\t\t{:.2f}'.format(
int(100 * recall_level), 100 * fpr_base, 100 * fpr_ours))
print('AUROC:\t\t\t{:.2f}\t\t{:.2f}'.format(
100 * auroc_base, 100 * auroc_ours))
print('AUPR:\t\t\t{:.2f}\t\t{:.2f}'.format(
100 * aupr_base, 100 * aupr_ours))
# print('FDR{:d}:\t\t\t{:.2f}\t\t{:.2f}'.format(
# int(100 * recall_level), 100 * fdr_base, 100 * fdr_ours))

# ==== utils.py ====
import os
import sys
sys.path.insert(0, "TruthfulQA")
import torch
import torch.nn as nn
import torch.nn.functional as F
import llama_iti
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import pandas as pd
import warnings
from einops import rearrange
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace, TraceDict
import sklearn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import pickle
from functools import partial
from truthfulqa import utilities, models, metrics
import openai
from truthfulqa.configs import BEST_COL, ANSWER_COL, INCORRECT_COL
import copy
ENGINE_MAP = {
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
'alpaca_7B': 'circulus/alpaca-7b',
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
'llama2_chat_7B': 'meta-llama/Llama-2-7b-chat-hf',
'llama2_chat_13B': 'meta-llama/Llama-2-13b-chat-hf',
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
}
from truthfulqa.utilities import (
format_prompt,
format_prompt_with_answer_strings,
split_multi_answer,
format_best,
find_start,
)
from truthfulqa.presets import preset_map, COMPARE_PRIMER
from truthfulqa.models import find_subsequence, set_columns, MC_calcs
from truthfulqa.evaluate import format_frame, data_to_dict
############# CCS #############
class MLPProbe(nn.Module):
def __init__(self, d):
super().__init__()
self.linear1 = nn.Linear(d, 100)
self.linear2 = nn.Linear(100, 1)
def forward(self, x):
h = F.relu(self.linear1(x))
o = self.linear2(h)
return torch.sigmoid(o)
class CCS(object):
def __init__(self, x0, x1, nepochs=1000, ntries=10, lr=1e-3, batch_size=-1,
verbose=False, device="cuda", linear=True, weight_decay=0.01, var_normalize=False):
# data
self.var_normalize = var_normalize
self.x0 = self.normalize(x0)
self.x1 = self.normalize(x1)
self.d = self.x0.shape[-1]
# training
self.nepochs = nepochs
self.ntries = ntries
self.lr = lr
self.verbose = verbose
self.device = device
self.batch_size = batch_size
self.weight_decay = weight_decay
# probe
self.linear = linear
self.probe = self.initialize_probe()
self.best_probe = copy.deepcopy(self.probe)
def initialize_probe(self):
if self.linear:
self.probe = nn.Sequential(nn.Linear(self.d, 1), nn.Sigmoid())
else:
self.probe = MLPProbe(self.d)
self.probe.to(self.device)
return self.probe  # without this, `self.probe = self.initialize_probe()` in __init__ would set the probe to None
def normalize(self, x):
"""
Mean-normalizes the data x (of shape (n, d))
If self.var_normalize, also divides by the standard deviation
"""
normalized_x = x - x.mean(axis=0, keepdims=True)
if self.var_normalize:
normalized_x /= normalized_x.std(axis=0, keepdims=True)
return normalized_x
def get_tensor_data(self):
"""
Returns x0, x1 as appropriate tensors (rather than np arrays)
"""
x0 = torch.tensor(self.x0, dtype=torch.float, requires_grad=False, device=self.device)
x1 = torch.tensor(self.x1, dtype=torch.float, requires_grad=False, device=self.device)
return x0, x1
def get_loss(self, p0, p1):
"""
Returns the CCS loss for two probabilities each of shape (n,1) or (n,)
"""
informative_loss = (torch.min(p0, p1) ** 2).mean(0)
consistent_loss = ((p0 - (1 - p1)) ** 2).mean(0)
return informative_loss + consistent_loss
def get_acc(self, x0_test, x1_test, y_test, return_conf=False):
"""
Computes accuracy for the current parameters on the given test inputs
"""
x0 = torch.tensor(self.normalize(x0_test), dtype=torch.float, requires_grad=False, device=self.device)
x1 = torch.tensor(self.normalize(x1_test), dtype=torch.float, requires_grad=False, device=self.device)
with torch.no_grad():
p0, p1 = self.best_probe(x0), self.best_probe(x1)
avg_confidence = 0.5 * (p0 + (1 - p1))
predictions = (avg_confidence.detach().cpu().numpy() < 0.5).astype(int)[:, 0]
acc = np.asarray((predictions == y_test), dtype=np.int32).mean()
acc = max(acc, 1 - acc)
if return_conf:
return avg_confidence
else:
return acc
def train(self):
"""
Does a single training run of nepochs epochs
"""
x0, x1 = self.get_tensor_data()
permutation = torch.randperm(len(x0))
x0, x1 = x0[permutation], x1[permutation]
# set up optimizer
optimizer = torch.optim.AdamW(self.probe.parameters(), lr=self.lr, weight_decay=self.weight_decay)
batch_size = len(x0) if self.batch_size == -1 else self.batch_size
nbatches = len(x0) // batch_size
# Start training (full batch)
for epoch in range(self.nepochs):
for j in range(nbatches):
x0_batch = x0[j * batch_size:(j + 1) * batch_size]
x1_batch = x1[j * batch_size:(j + 1) * batch_size]
# probe
p0, p1 = self.probe(x0_batch), self.probe(x1_batch)
# get the corresponding loss
loss = self.get_loss(p0, p1)
# update the parameters
optimizer.zero_grad()
loss.backward()
optimizer.step()
return loss.detach().cpu().item()
def repeated_train(self):
best_loss = np.inf
for train_num in range(self.ntries):
self.initialize_probe()
loss = self.train()
if loss < best_loss:
self.best_probe = copy.deepcopy(self.probe)
best_loss = loss
return best_loss
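The CCS objective in `get_loss` above can be restated standalone: probabilities assigned to a statement (`p0`) and its negation (`p1`) should be confident (informative term) and sum to one (consistency term). The example pair is made up:

```python
import torch

def ccs_loss(p0, p1):
    # confidence: at least one of the pair should be far from 0.5
    informative = (torch.min(p0, p1) ** 2).mean()
    # consistency: p0 should equal 1 - p1
    consistent = ((p0 - (1 - p1)) ** 2).mean()
    return informative + consistent

p0 = torch.tensor([0.9, 0.1])
p1 = torch.tensor([0.1, 0.9])  # perfectly consistent, fairly confident pair
loss = ccs_loss(p0, p1).item()
```

Here the consistency term is exactly zero and the informative term is 0.1² = 0.01.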
def load_nq():
dataset = load_dataset("OamPatel/iti_nq_open_val")["validation"]
df = pd.DataFrame(columns=["question", "answer", "false_answer"])
for row in dataset:
new_row = pd.DataFrame({"question": [row["question"]], "answer": [[_ for _ in row["answer"]]], "false_answer": [row["false_answer"]]})
df = pd.concat([df, new_row], ignore_index=True)
return df
def load_triviaqa():
dataset = load_dataset("OamPatel/iti_trivia_qa_val")["validation"]
df = pd.DataFrame(columns=["question", "answer", "false_answer"])
for row in dataset:
new_row = pd.DataFrame({"question": [row["question"]], "answer": [[_ for _ in row["answer"]['aliases']]], "false_answer": [row["false_answer"]]})
df = pd.concat([df, new_row], ignore_index=True)
return df
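The row-by-row `pd.concat` pattern in the two loaders above is quadratic in the number of rows; an equivalent single-pass sketch (with made-up example rows) collects dicts first and builds the frame once:

```python
import pandas as pd

# illustrative rows; real ones come from the HF datasets above
rows = [
    {"question": "q1", "answer": ["a1", "a1b"], "false_answer": "f1"},
    {"question": "q2", "answer": ["a2"], "false_answer": "f2"},
]
df = pd.DataFrame(rows, columns=["question", "answer", "false_answer"])
```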
def format_truthfulqa(question, choice, args):
if args.q_only:
return f"Q: {question}"
elif args.append_same_token:
return f"Q: {question} A: {choice}." + " The answer to the question is right"
else:
return f"Q: {question} A: {choice}"
def format_truthfulqa_end_q(question, choice, rand_question, args):
return f"Q: {question} A: {choice} Q: {rand_question}"
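An illustrative check of the two prompt templates above, with the `args` flags inlined and made-up inputs:

```python
question, choice, rand_question = "What is 2+2?", "4", "Capital of France?"

# default template from format_truthfulqa (q_only / append_same_token off)
default_prompt = f"Q: {question} A: {choice}"
# template from format_truthfulqa_end_q: a random question is appended
end_q_prompt = f"Q: {question} A: {choice} Q: {rand_question}"
```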
def tokenized_tqa(dataset, tokenizer, args):
all_prompts = []
all_labels = []
for i in range(len(dataset)):
question = dataset[i]['question']
choices = dataset[i]['mc2_targets']['choices']
labels = dataset[i]['mc2_targets']['labels']
assert len(choices) == len(labels), (len(choices), len(labels))
for j in range(len(choices)):
choice = choices[j]
label = labels[j]
prompt = format_truthfulqa(question, choice, args)
if i == 0 and j == 0:
print(prompt)
prompt = tokenizer(prompt, return_tensors = 'pt').input_ids
all_prompts.append(prompt)
all_labels.append(label)
return all_prompts, all_labels
def tokenized_tqa_gen_end_q(dataset, tokenizer, args):
all_prompts = []
all_labels = []
all_categories = []
for i in range(len(dataset)):
question = dataset[i]['question']
category = dataset[i]['category']
rand_idx = np.random.randint(len(dataset))
rand_question = dataset[rand_idx]['question']
for j in range(len(dataset[i]['correct_answers'])):
answer = dataset[i]['correct_answers'][j]
prompt = format_truthfulqa_end_q(question, answer, rand_question, args)
prompt = tokenizer(prompt, return_tensors = 'pt').input_ids
all_prompts.append(prompt)
all_labels.append(1)
all_categories.append(category)
for j in range(len(dataset[i]['incorrect_answers'])):
answer = dataset[i]['incorrect_answers'][j]
prompt = format_truthfulqa_end_q(question, answer, rand_question, args)
prompt = tokenizer(prompt, return_tensors = 'pt').input_ids
all_prompts.append(prompt)
all_labels.append(0)
all_categories.append(category)
return all_prompts, all_labels, all_categories
def tokenized_tqa_gen(dataset, tokenizer, args):
all_prompts = []
all_labels = []
all_categories = []
all_answer_length = []
for i in range(len(dataset)):
question = dataset[i]['question']
category = dataset[i]['category']
for j in range(len(dataset[i]['correct_answers'])):
answer = dataset[i]['correct_answers'][j]
prompt = format_truthfulqa(question, answer, args)
prompt = tokenizer(prompt, return_tensors = 'pt').input_ids
if args.average:
all_answer_length.append(len(tokenizer(f"{answer}", return_tensors = 'pt').input_ids[0]) - 1)
all_prompts.append(prompt)
all_labels.append(1)
all_categories.append(category)
for j in range(len(dataset[i]['incorrect_answers'])):
answer = dataset[i]['incorrect_answers'][j]
prompt = format_truthfulqa(question, answer, args)
prompt = tokenizer(prompt, return_tensors = 'pt').input_ids
if args.average:
all_answer_length.append(len(tokenizer(f"{answer}", return_tensors = 'pt').input_ids[0]) - 1)
all_prompts.append(prompt)
all_labels.append(0)
all_categories.append(category)
return all_prompts, all_labels, all_categories, all_answer_length
def get_llama_activations_bau(model, prompt, device):
HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)]
MLPS = [f"model.layers.{i}.mlp" for i in range(model.config.num_hidden_layers)]
with torch.no_grad():
prompt = prompt.to(device)
with TraceDict(model, HEADS+MLPS) as ret:
output = model(prompt, output_hidden_states = True)
hidden_states = output.hidden_states
hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
hidden_states = hidden_states.detach().cpu().numpy()
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim = 0).squeeze().numpy()
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim = 0).squeeze().numpy()
return hidden_states, head_wise_hidden_states, mlp_wise_hidden_states
def get_llama_logits(model, prompt, device):
model.eval()
with torch.no_grad():
prompt = prompt.to(device)
logits = model(prompt).logits
logits = logits.detach().cpu()
return logits
def save_probes(probes, path):
"""takes in a list of sklearn lr probes and saves them to path"""
with open(path, 'wb') as f:
pickle.dump(probes, f)
def load_probes(path):
"""loads a list of sklearn lr probes from path"""
with open(path, 'rb') as f:
probes = pickle.load(f)
return probes
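A roundtrip sketch of `save_probes` / `load_probes` on a toy probe, using a temp directory and made-up training data:

```python
import os
import pickle
import tempfile

import numpy as np
from sklearn.linear_model import LogisticRegression

# fit one toy probe: scores >= 1.5 map to class 1
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
probes = [LogisticRegression().fit(X, y)]

path = os.path.join(tempfile.mkdtemp(), "probes.pkl")
with open(path, "wb") as f:      # save_probes
    pickle.dump(probes, f)
with open(path, "rb") as f:      # load_probes
    loaded = pickle.load(f)

pred = loaded[0].predict([[3.0]])[0]
```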
# -- TruthfulQA helper functions -- #
def tqa_run_answers(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None):
"""Stores answers from autoregressive HF models (GPT-2, GPT-Neo)"""
if tag not in frame.columns:
frame[tag] = ''
frame[tag].fillna('', inplace=True)
frame[tag] = frame[tag].astype(str)
# get tokens for ending sequence
seq_start = np.array(tokenizer('A:')['input_ids'])
seq_end = np.array(tokenizer('Q:')['input_ids'])
tokens = []
for idx in frame.index:
if pd.isnull(frame.loc[idx, tag]) or not len(frame.loc[idx, tag]):
prompt = format_prompt(frame.loc[idx], preset, format='general')
prefix = ''
if instruction_prompt: # instruction prefix from Ouyang et al. (2022), Fig. 17; also used in the LLaMA evaluation and adopted here
prefix += 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n'
if many_shot_prefix is not None:
prefix += many_shot_prefix + '\n\n'
prompt = prefix + prompt
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
tokens.append(input_ids)
# --- intervention code --- #
def id(head_output, layer_name):
return head_output
if interventions == {}:
intervene = id
layers_to_intervene = []
else:
intervene = partial(intervention_fn, start_edit_location='lt')
layers_to_intervene = list(interventions.keys())
# --- intervention code --- #
sequences = []
with torch.no_grad():
for idx, input_ids in enumerate(tqdm(tokens)):
max_len = input_ids.shape[-1] + 50
# --- intervention code --- #
with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
input_ids = input_ids.to(device)
model_gen_tokens = model.generate(input_ids, top_k=1, max_length=max_len, num_return_sequences=1,)[:, input_ids.shape[-1]:]
model_gen_str = tokenizer.decode(model_gen_tokens[0], skip_special_tokens=True)
model_gen_str = model_gen_str.strip()
try:
# remove everything after 'Q:'
model_gen_str = model_gen_str.split("Q:")[0].strip()
# keep everything after A:
model_gen_str = model_gen_str.split("A:")[1].strip()
except IndexError:
# the generation did not contain the expected "A:" marker; keep the raw string
pass
if verbose:
print("MODEL_OUTPUT: ", model_gen_str)
frame.loc[idx, tag] = model_gen_str
sequences.append(model_gen_str)
# --- intervention code --- #
if device:
torch.cuda.empty_cache()
return frame
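The answer-extraction slicing in the try/except above, demonstrated on a made-up generation string:

```python
model_gen_str = "A: Paris. Q: What else?"

# remove everything after 'Q:' (a follow-up question the model hallucinated)
ans = model_gen_str.split("Q:")[0].strip()
# keep everything after 'A:' (the answer marker)
ans = ans.split("A:")[1].strip()
```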
def tqa_run_probs(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None):
"""Runs multiple-choice metrics for autoregressive HuggingFace models (GPT-2, GPT-Neo)"""
set_columns(tag, frame)
if model is None:
model = AutoModelForCausalLM.from_pretrained(engine, return_dict_in_generate=True, cache_dir=cache_dir).to(device)
model.eval()
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained(engine, cache_dir=cache_dir)
with torch.no_grad():
for idx in tqdm(frame.index):
if pd.isnull(frame.loc[idx, '{0} lprob max'.format(tag)]):
# check that answer exists
if pd.isnull(frame.loc[idx, INCORRECT_COL]):
warnings.warn("References missing for {0}!".format(idx), stacklevel=2)
continue
if not len(frame.loc[idx, INCORRECT_COL]):
warnings.warn("References missing for {0}!".format(idx), stacklevel=2)
continue
# reference answers
ref_best = format_best(frame.loc[idx, BEST_COL])
ref_true = split_multi_answer(frame.loc[idx, ANSWER_COL])
ref_false = split_multi_answer(frame.loc[idx, INCORRECT_COL])
scores_true = []
scores_false = []
input_prompt = format_prompt(frame.loc[idx], preset, format='general')
if many_shot_prefix is not None:
input_prompt = many_shot_prefix + input_prompt
if instruction_prompt:
input_prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + input_prompt
# --- intervention code --- #
def id(head_output, layer_name):
return head_output
if interventions == {}:
layers_to_intervene = []
else:
layers_to_intervene = list(interventions.keys())
# --- intervention code --- #
for temp_ans in ref_true:
# append the current answer choice to the prompt
prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'],
temp_ans,
preset,
format='general')
if many_shot_prefix is not None:
prompt = many_shot_prefix + prompt
if instruction_prompt:
prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device)
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt
if interventions == {}:
intervene = id
else:
intervene = partial(intervention_fn, start_edit_location=start_edit_location)
with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
outputs = model(prompt_ids)[0].squeeze(0)
outputs = outputs.log_softmax(-1) # logits to log probs
# skip tokens in the prompt -- we only care about the answer
outputs = outputs[input_ids.shape[-1] - 1: -1, :]
prompt_ids = prompt_ids[0, input_ids.shape[-1]:]
# get logprobs for each token in the answer
log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)]
log_probs = log_probs[3:] # drop the '\nA:' prefix
scores_true.append(log_probs.sum().item())
for temp_ans in ref_false:
# append the current answer choice to the prompt
prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'],
temp_ans,
preset,
format='general')
if many_shot_prefix is not None:
prompt = many_shot_prefix + prompt
if instruction_prompt:
prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device)
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt
if interventions == {}:
intervene = id
else:
intervene = partial(intervention_fn, start_edit_location=start_edit_location)
with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
outputs = model(prompt_ids)[0].squeeze(0)
outputs = outputs.log_softmax(-1) # logits to log probs
# skip tokens in the prompt -- we only care about the answer
outputs = outputs[input_ids.shape[-1] - 1: -1, :]
prompt_ids = prompt_ids[0, input_ids.shape[-1]:]
# get logprobs for each token in the answer
log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)]
log_probs = log_probs[3:] # drop the '\nA:' prefix
scores_false.append(log_probs.sum().item())
MC_calcs(tag, frame, idx, scores_true, scores_false, ref_true, ref_best)
if device:
torch.cuda.empty_cache()
return frame
def run_ce_loss(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100):
# load owt text
# note this is tokenized with llama tokenizer
dataset = load_dataset("stas/openwebtext-10k")['train']
dataset = dataset.shuffle()
dataset = dataset.select(range(num_samples))
# tokenize
owt = dataset.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:,:128])})
owt.set_format(type='torch', columns=['input_ids'])
# define intervention
def id(head_output, layer_name):
return head_output
if interventions == {}:
layers_to_intervene = []
intervention_fn = id
else:
layers_to_intervene = list(interventions.keys())
intervention_fn = partial(intervention_fn, start_edit_location=0)
losses = []
rand_idxs = np.random.choice(len(owt), num_samples, replace=False).tolist()
with torch.no_grad():
for i in tqdm(rand_idxs):
input_ids = owt[i]['input_ids'][:, :128].to(device)
with TraceDict(model, layers_to_intervene, edit_output=intervention_fn) as ret:
loss = model(input_ids, labels=input_ids).loss
losses.append(loss.item())
return np.mean(losses)
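What `model(input_ids, labels=input_ids).loss` computes for HF causal LMs, sketched with random logits: cross-entropy of position t's prediction against token t+1 (the sizes here are made up):

```python
import torch
import torch.nn.functional as F

vocab, seq = 10, 6
logits = torch.randn(1, seq, vocab)                 # stand-in for model output
input_ids = torch.randint(0, vocab, (1, seq))

# shift so position t predicts token t+1
shift_logits = logits[:, :-1, :].reshape(-1, vocab)
shift_labels = input_ids[:, 1:].reshape(-1)
loss = F.cross_entropy(shift_logits, shift_labels)
```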
def run_kl_wrt_orig(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100, separate_kl_device=None):
assert 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key, 'model must be llama model'
# load owt text
# note this is tokenized with llama tokenizer
dataset = load_dataset("stas/openwebtext-10k")['train']
dataset = dataset.shuffle()
dataset = dataset.select(range(num_samples))
# tokenize
owt = dataset.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:,:128])})
owt.set_format(type='torch', columns=['input_ids'])
# define intervention
def id(head_output, layer_name):
return head_output
if interventions == {}:
layers_to_intervene = []
intervention_fn = id
else:
layers_to_intervene = list(interventions.keys())
intervention_fn = partial(intervention_fn, start_edit_location=0)
kl_divs = []
rand_idxs = np.random.choice(len(owt), num_samples, replace=False).tolist()
if separate_kl_device is not None:
orig_model = llama_iti.LLaMAForCausalLM.from_pretrained(ENGINE_MAP[model_key], torch_dtype=torch.float16, low_cpu_mem_usage=True)
orig_model.to('cuda')
with torch.no_grad():
for i in tqdm(rand_idxs):
input_ids = owt[i]['input_ids'][:, :128].to(device)
if separate_kl_device is not None:
orig_logits = orig_model(input_ids.to('cuda')).logits.cpu().type(torch.float32)
else:
orig_logits = model(input_ids).logits.cpu().type(torch.float32)
orig_probs = F.softmax(orig_logits, dim=-1)
with TraceDict(model, layers_to_intervene, edit_output=intervention_fn) as ret:
logits = model(input_ids).logits.cpu().type(torch.float32)
probs = F.softmax(logits, dim=-1)
kl_div = (orig_probs * (orig_probs / probs).log()).sum() / (input_ids.shape[-1] * input_ids.shape[-2])
kl_divs.append(kl_div.item())
return np.mean(kl_divs)
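The per-token KL term used above, checked against `torch.nn.functional.kl_div` on made-up logits (`kl_div(input=log q, target=p)` computes KL(p‖q)):

```python
import torch
import torch.nn.functional as F

orig_logits = torch.tensor([[2.0, 0.0, -1.0]])     # unedited model
edited_logits = torch.tensor([[1.5, 0.5, -1.0]])   # intervened model

orig_probs = F.softmax(orig_logits, dim=-1)
probs = F.softmax(edited_logits, dim=-1)

# manual KL(orig || edited), as in the loop above
kl_manual = (orig_probs * (orig_probs / probs).log()).sum()
# same quantity via the library call
kl_torch = F.kl_div(probs.log(), orig_probs, reduction="sum")
```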
def alt_tqa_evaluate(models, metric_names, input_path, output_path, summary_path, device='cpu', verbose=False, preset='qa', interventions={}, intervention_fn=None, cache_dir=None, separate_kl_device=None, instruction_prompt=True, many_shot_prefix=None, judge_name=None, info_name=None):
"""
Inputs:
models: a dictionary of the form {model_name: model} where model is a HF transformer # TODO: doesn't work with models other than llama right now
metric_names: a list of metric names to evaluate (ex: ['mc', 'judge', 'info', 'bleu'])
input_path: where to draw TruthfulQA questions from
output_path: where to store model outputs and full metric outputs
summary_path: where to store metric summaries
interventions: a dictionary of the form {layer_name: [(head, direction, projected_mean, projected_std)]}
intervention_fn: a function that takes in a head output and a layer name and returns the intervened output
Outputs a pd dataframe with summary values
"""
questions = utilities.load_questions(filename=input_path)
print("ASSUMES OPENAI_API_KEY ENVIRONMENT VARIABLE IS SET")
import os
openai.api_key = os.environ.get('OPENAI_API_KEY')
for mdl in models.keys():
# gpt-3
if mdl in ['ada', 'babbage', 'curie', 'davinci']: # gpt-3 models
try:
models.run_GPT3(questions, mdl, mdl, preset)
utilities.save_questions(questions, output_path)
if 'mc' in metric_names:
models.run_probs_GPT3(questions, mdl, mdl, preset=preset)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
# gpt-2
if mdl in ['gpt2', 'gpt2-xl']:
try:
print(questions)
questions = models.run_answers(questions, mdl, mdl, preset, device=device, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
if 'mc' in metric_names:
models.run_probs(questions, mdl, mdl, preset=preset, device=device, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
# llama
if mdl in ['llama_7B', 'alpaca_7B', 'vicuna_7B', 'llama2_chat_7B', 'llama2_chat_13B', 'llama2_chat_70B']:
assert models[mdl] is not None, 'must provide llama model'
llama_model = models[mdl]
llama_tokenizer = llama_iti.LlamaTokenizer.from_pretrained(ENGINE_MAP[mdl])
if 'judge' in metric_names or 'info' in metric_names:
questions = tqa_run_answers(questions, ENGINE_MAP[mdl], mdl, preset, model=llama_model, tokenizer=llama_tokenizer,
device=device, cache_dir=cache_dir, verbose=verbose,
interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix)
utilities.save_questions(questions, output_path)
if 'mc' in metric_names:
questions = tqa_run_probs(questions, ENGINE_MAP[mdl], mdl, model=llama_model, tokenizer=llama_tokenizer, preset=preset, device=device, cache_dir=cache_dir, verbose=False, interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix)
utilities.save_questions(questions, output_path)
# gpt-neo
if mdl in ['neo-small', 'neo-med', 'neo-large']:
try:
models.run_answers(questions, ENGINE_MAP[mdl], mdl, preset,
device=device, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
if 'mc' in metric_names:
models.run_probs(questions, ENGINE_MAP[mdl], mdl, preset=preset, device=device,
cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
except Exception as err:
print("ERROR")
print(err)
# unifiedqa
if mdl in ['uqa-small', 'uqa-base', 'uqa-large', 'uqa-3b']:
try:
models.run_UnifQA(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
if 'mc' in metric_names:
models.run_probs_T5(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
for model_key in models.keys():
for metric in metric_names:
if metric == 'mc':
continue
if metric == 'bleurt':
try:
questions = metrics.run_BLEURT(model_key, questions, cache_dir=cache_dir)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
elif metric in ['bleu', 'rouge']:
try:
questions = metrics.run_bleu_and_rouge(model_key, questions)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
elif metric in ['judge', 'info']:
try:
if metric == 'judge':
questions = metrics.run_end2end_GPT3(model_key, 'GPT-judge', judge_name, questions, info=False)
utilities.save_questions(questions, output_path)
else:
questions = metrics.run_end2end_GPT3(model_key, 'GPT-info', info_name, questions, info=True)
utilities.save_questions(questions, output_path)
except Exception as err:
print(err)
else:
warnings.warn("Metric {0} not known, skipping!".format(metric), stacklevel=2)
# save all
utilities.save_questions(questions, output_path)
# format and print basic results
results = format_frame(questions)
results = results.mean(axis=0)
results = results.reset_index().rename(columns={'level_0': 'Model',
'level_1': 'Metric',
0: 'Value'})
# filter to most informative metrics
results = results[results['Metric'].isin(['MC1', 'MC2',
'bleu acc',
'rouge1 acc',
'BLEURT acc',
'GPT-judge acc',
'GPT-info acc'])]
results = pd.pivot_table(results, 'Value', 'Model', 'Metric')
# calculate cross entropy loss on owt and kl wrt to original unedited on owt
results['CE Loss'] = np.nan
results['KL wrt Orig'] = np.nan
for model_key in models.keys():
if 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key:
ce_loss = run_ce_loss(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn)
kl_wrt_orig = run_kl_wrt_orig(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn, separate_kl_device=separate_kl_device)
results.loc[model_key, 'CE Loss'] = ce_loss
results.loc[model_key, 'KL wrt Orig'] = kl_wrt_orig
# save results
results.to_csv(summary_path, index=False)
return results
def flattened_idx_to_layer_head(flattened_idx, num_heads):
return flattened_idx // num_heads, flattened_idx % num_heads
def layer_head_to_flattened_idx(layer, head, num_heads):
return layer * num_heads + head
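Since these two index helpers are used throughout the intervention code, a quick sanity check (an illustrative sketch, repeating the definitions so the snippet is self-contained) confirms they are inverses of each other:

```python
def flattened_idx_to_layer_head(flattened_idx, num_heads):
    return flattened_idx // num_heads, flattened_idx % num_heads

def layer_head_to_flattened_idx(layer, head, num_heads):
    return layer * num_heads + head

# e.g. with 32 heads per layer (as in LLaMA-2 7B), head 5 of layer 3 flattens to 3*32+5
assert layer_head_to_flattened_idx(3, 5, 32) == 101
assert flattened_idx_to_layer_head(101, 32) == (3, 5)
```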
def train_probes(seed, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels, num_layers, num_heads):
all_head_accs = []
probes = []
all_X_train = np.concatenate([separated_head_wise_activations[i] for i in train_set_idxs], axis = 0)
all_X_val = np.concatenate([separated_head_wise_activations[i] for i in val_set_idxs], axis = 0)
y_train = np.concatenate([separated_labels[i] for i in train_set_idxs], axis = 0)
y_val = np.concatenate([separated_labels[i] for i in val_set_idxs], axis = 0)
for layer in tqdm(range(num_layers)):
for head in range(num_heads):
X_train = all_X_train[:,layer,head,:]
X_val = all_X_val[:,layer,head,:]
clf = LogisticRegression(random_state=seed, max_iter=1000).fit(X_train, y_train)
y_val_pred = clf.predict(X_val)
all_head_accs.append(accuracy_score(y_val, y_val_pred))
probes.append(clf)
all_head_accs_np = np.array(all_head_accs)
return probes, all_head_accs_np
def get_top_heads(train_idxs, val_idxs, separated_activations, separated_labels, num_layers, num_heads, seed, num_to_intervene, use_random_dir=False):
probes, all_head_accs_np = train_probes(seed, train_idxs, val_idxs, separated_activations, separated_labels, num_layers=num_layers, num_heads=num_heads)
all_head_accs_np = all_head_accs_np.reshape(num_layers, num_heads)
    # sort flattened head accuracies descending and keep the top-k indices
    top_head_idxs = np.argsort(all_head_accs_np.reshape(num_heads * num_layers))[::-1][:num_to_intervene]
    top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in top_head_idxs]
if use_random_dir:
# overwrite top heads with random heads, no replacement
random_idxs = np.random.choice(num_heads*num_layers, num_heads*num_layers, replace=False)
top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in random_idxs[:num_to_intervene]]
return top_heads, probes
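The head-selection step can be checked on a toy accuracy matrix (hypothetical numbers, with the argsort-based selection inlined so the snippet stands alone):

```python
import numpy as np

# toy validation accuracies for 2 layers x 3 heads (hypothetical numbers)
num_layers, num_heads, num_to_intervene = 2, 3, 2
accs = np.array([[0.50, 0.90, 0.60],
                 [0.70, 0.55, 0.95]])
# same selection as get_top_heads: sort flattened accuracies, take the top-k
top_idxs = np.argsort(accs.reshape(num_layers * num_heads))[::-1][:num_to_intervene]
top_heads = [(idx // num_heads, idx % num_heads) for idx in top_idxs]
assert top_heads == [(1, 2), (0, 1)]  # acc 0.95 first, then 0.90
```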
def get_interventions_dict(top_heads, probes, tuning_activations, num_heads, use_center_of_mass, use_random_dir, com_directions):
interventions = {}
for layer, head in top_heads:
interventions[f"model.layers.{layer}.self_attn.head_out"] = []
for layer, head in top_heads:
if use_center_of_mass:
direction = com_directions[layer_head_to_flattened_idx(layer, head, num_heads)]
elif use_random_dir:
direction = np.random.normal(size=(128,))
else:
direction = probes[layer_head_to_flattened_idx(layer, head, num_heads)].coef_
direction = direction / np.linalg.norm(direction)
activations = tuning_activations[:,layer,head,:] # batch x 128
proj_vals = activations @ direction.T
proj_val_std = np.std(proj_vals)
interventions[f"model.layers.{layer}.self_attn.head_out"].append((head, direction.squeeze(), proj_val_std))
for layer, head in top_heads:
interventions[f"model.layers.{layer}.self_attn.head_out"] = sorted(interventions[f"model.layers.{layer}.self_attn.head_out"], key = lambda x: x[0])
return interventions
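Each tuple stored per head holds the head index, a unit-norm direction, and the std of activation projections onto that direction; this isolated sketch reproduces the computation on random data (shapes assumed: 128-dim head outputs, a tuning batch of 64):

```python
import numpy as np

# random stand-ins for one head's steering direction and tuning activations
rng = np.random.default_rng(0)
direction = rng.normal(size=(128,))
direction = direction / np.linalg.norm(direction)  # unit-norm, as in the loop above
activations = rng.normal(size=(64, 128))           # batch x head_dim
proj_vals = activations @ direction                # signed projection per sample
proj_val_std = np.std(proj_vals)                   # scales the intervention strength
assert np.isclose(np.linalg.norm(direction), 1.0)
assert proj_vals.shape == (64,)
```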
def get_separated_activations(labels, head_wise_activations):
# separate activations by question
dataset=load_dataset('truthful_qa', 'multiple_choice')['validation']
actual_labels = []
for i in range(len(dataset)):
actual_labels.append(dataset[i]['mc2_targets']['labels'])
idxs_to_split_at = np.cumsum([len(x) for x in actual_labels])
labels = list(labels)
separated_labels = []
for i in range(len(idxs_to_split_at)):
if i == 0:
separated_labels.append(labels[:idxs_to_split_at[i]])
else:
separated_labels.append(labels[idxs_to_split_at[i-1]:idxs_to_split_at[i]])
assert separated_labels == actual_labels
separated_head_wise_activations = np.split(head_wise_activations, idxs_to_split_at)
return separated_head_wise_activations, separated_labels, idxs_to_split_at
def get_com_directions(num_layers, num_heads, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels):
com_directions = []
    usable_idxs = np.concatenate([train_set_idxs, val_set_idxs], axis=0)  # invariant across heads, hoisted out of the loop
    for layer in range(num_layers):
        for head in range(num_heads):
usable_head_wise_activations = np.concatenate([separated_head_wise_activations[i][:,layer,head,:] for i in usable_idxs], axis=0)
usable_labels = np.concatenate([separated_labels[i] for i in usable_idxs], axis=0)
true_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 1], axis=0)
false_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 0], axis=0)
com_directions.append(true_mass_mean - false_mass_mean)
com_directions = np.array(com_directions)
return com_directions
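On toy 2-D data (hypothetical numbers), the center-of-mass direction reduces to the difference of the class-conditional means, which is exactly what the loop above computes per head:

```python
import numpy as np

# four activations in 2-D: label 1 = truthful, label 0 = hallucinated (toy data)
acts = np.array([[1.0, 0.0], [3.0, 0.0], [0.0, 2.0], [0.0, 4.0]])
labels = np.array([1, 1, 0, 0])
true_mass_mean = acts[labels == 1].mean(axis=0)    # [2., 0.]
false_mass_mean = acts[labels == 0].mean(axis=0)   # [0., 3.]
com_direction = true_mass_mean - false_mass_mean
assert np.allclose(com_direction, [2.0, -3.0])
```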

ylib/__init__.py (new file, empty)

ylib/yplot.py (new file, 135 lines)
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import cv2
# from .scipy_misc import toimage
def plot_matrix(mat, path=None, xticks=None, yticks=None, xlim=None, ylim=None, figsize=(6,4), title=None, xlabel=None, ylabel=None, fontsize=20, cmap="YlGnBu"):
plt.figure(figsize=figsize)
# vis = vote_map.reshape(20, -1, 4).max(2)
with sns.axes_style("white"):
ax = sns.heatmap(mat, cmap=cmap)
if xticks is not None:
plt.xticks(xticks)
if yticks is not None:
plt.yticks(yticks)
if xlim is not None:
plt.xlim(xlim)
if ylim is not None:
plt.ylim(ylim)
if title is not None:
plt.title(title, fontsize=fontsize)
if xlabel is not None:
plt.xlabel(xlabel, fontsize=fontsize)
if ylabel is not None:
plt.ylabel(ylabel, fontsize=fontsize)
plt.tight_layout()
if path is not None:
plt.savefig(path)
plt.close()
else:
plt.show()
def plot_shaded(mid, high, low, path=None, xticks=None, yticks=None, xlim=None, ylim=None, figsize=(6,4), title=None, xlabel=None, ylabel=None, fontsize=20, color = '#5f87bc'):
dim = len(mid)
plt.figure(figsize=figsize)
lw = 1
plt.plot(mid, linewidth=lw, color=color)
plt.plot(np.zeros(dim), linewidth=1/2., color=color)
plt.fill_between(range(dim), low, high, linewidth=0.1, alpha=0.5, color=color)
if xticks is not None:
plt.xticks(xticks)
if yticks is not None:
plt.yticks(yticks)
if xlim is not None:
plt.xlim(xlim)
if ylim is not None:
plt.ylim(ylim)
if title is not None:
plt.title(title, fontsize=fontsize)
if xlabel is not None:
plt.xlabel(xlabel, fontsize=fontsize)
if ylabel is not None:
plt.ylabel(ylabel, fontsize=fontsize)
plt.tight_layout()
if path is not None:
plt.savefig(path)
else:
plt.show()
def plot_mean_std(mean, std, path=None, xticks=None, yticks=None, xlim=None, ylim=None, figsize=(6,4), title=None, xlabel=None, ylabel=None):
plot_shaded(mean, mean+std, mean-std, path=path, xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, figsize=figsize, title=title, xlabel=xlabel, ylabel=ylabel)
def plot_lines(lines, path=None, legends=[], xticks=None, yticks=None, xlim=None, ylim=None, figsize=(6,4), linewidth=1, title=None, xlabel=None, ylabel=None, fontsize=20):
# color = '#5f87bc'
plt.figure(figsize=figsize)
if len(legends) > 0:
for line, legend in zip(lines, legends):
plt.plot(line, label=legend, linewidth=linewidth)
plt.legend()
else:
plt.plot(lines, linewidth=linewidth)
if xticks is not None:
plt.xticks(xticks)
if yticks is not None:
plt.yticks(yticks)
if xlim is not None:
plt.xlim(xlim)
if ylim is not None:
plt.ylim(ylim)
if title is not None:
plt.title(title, fontsize=fontsize)
if xlabel is not None:
plt.xlabel(xlabel, fontsize=fontsize)
if ylabel is not None:
plt.ylabel(ylabel, fontsize=fontsize)
plt.tight_layout()
if path is not None:
plt.savefig(path)
else:
plt.show()
def plot_distrib(dist1, dist2=None, path=None, figsize=(6,4), xticks=None, yticks=None, xlim=None, ylim=None, linewidth=1, title=None, xlabel=None, ylabel=None, fontsize=20, fpr_shade=True, color='#5f87bc'):
plt.figure(figsize=figsize)
    # NOTE: sns.distplot is deprecated in recent seaborn releases; this call assumes an older seaborn version
    g1 = sns.distplot(pd.Series(data=dist1, name=''), color=color, hist=False, kde_kws={"shade": True, 'linewidth': linewidth})
if xlim is not None:
g1.set(xlim=xlim)
# g1.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[], xlim=xlim, ylim=ylim)
# g1.set(xlabel=None, ylabel=None)
if dist2 is not None:
g2 = sns.distplot(pd.Series(data=dist2, name=''), color='#444444', hist=False, kde_kws={"shade": True, 'linewidth': linewidth},)
# g2.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[], xlim=xlim, ylim=ylim)
# g2.set(xlabel=None, ylabel=None)
if fpr_shade:
arr = g2.get_children()[1].get_paths()[0].vertices
x, y = arr[:, 0], arr[:, 1]
x, y = x[y > 0], y[y > 0]
            order = x.argsort()  # compute the sort order once
            x, y = x[order], y[order]
mask = x > np.percentile(dist1, 5)
g2.fill_between(x[mask], y1=y[mask], y2=0, alpha=0.3, facecolor='#444444', hatch='////')
# sns.despine(bottom=True, left=True)
if title is not None:
plt.title(title, fontsize=fontsize)
if xlabel is not None:
plt.xlabel(xlabel, fontsize=fontsize)
if ylabel is not None:
plt.ylabel(ylabel, fontsize=fontsize)
plt.tight_layout()
if path is not None:
plt.savefig(path)
else:
plt.show()
# def colormap(img, mode=cv2.COLORMAP_JET):
# img = toimage(img)
# colormask = cv2.applyColorMap(np.array(img), mode)[:,:,::-1]
# return colormask

ylib/ytool.py (new file, 35 lines)
import torch
import numpy as np
def cluster_acc(y_true, y_pred, print_ret=False):
from scipy.optimize import linear_sum_assignment
y_true = y_true.astype(np.int64)
assert y_pred.size == y_true.size
D = max(y_pred.max(), y_true.max()) + 1
w = np.zeros((D, D), dtype=np.int64)
for i in range(y_pred.size):
w[y_pred[i], y_true[i]] += 1
row_ind, col_ind = linear_sum_assignment(w.max() - w)
acc = w[row_ind, col_ind].sum() / y_pred.size
if print_ret:
print("Fit acc: ", acc)
return acc
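As a usage sketch (toy labels, with the matching logic inlined from cluster_acc above so the snippet is self-contained), the Hungarian assignment makes the accuracy invariant to a permutation of predicted cluster ids:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([1, 1, 2, 2, 0, 0])  # identical partition, cluster ids permuted
D = max(y_pred.max(), y_true.max()) + 1
w = np.zeros((D, D), dtype=np.int64)   # contingency matrix: pred id x true id
for i in range(y_pred.size):
    w[y_pred[i], y_true[i]] += 1
# maximize matched counts by minimizing (max - count)
row_ind, col_ind = linear_sum_assignment(w.max() - w)
acc = w[row_ind, col_ind].sum() / y_pred.size
assert acc == 1.0  # the matching recovers the id permutation
```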
class ArrayDataset(torch.utils.data.Dataset):
def __init__(self, features, labels=None) -> None:
self.features = features
self.labels = labels
def __getitem__(self, index):
if self.labels is None:
return self.features[index]
else:
return self.features[index], self.labels[index]
def __len__(self):
return len(self.features)
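A hypothetical usage sketch for ArrayDataset (the class is repeated here so the snippet is self-contained): wrap feature/label tensors and iterate mini-batches through a DataLoader:

```python
import torch

class ArrayDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels
    def __getitem__(self, index):
        if self.labels is None:
            return self.features[index]
        return self.features[index], self.labels[index]
    def __len__(self):
        return len(self.features)

features = torch.randn(10, 4)   # 10 samples, 4 features each
labels = torch.arange(10)
loader = torch.utils.data.DataLoader(ArrayDataset(features, labels), batch_size=4)
batches = list(loader)
assert len(batches) == 3                      # 4 + 4 + 2 samples
assert tuple(batches[0][0].shape) == (4, 4)   # first batch of features
```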