Accessing the EHR data from the TRE

1 Accessing EHR data from the TRE
This notebook describes the process of accessing EHR data stored within a TRE-accessible Azure Data Lake Storage Gen2 (ADLS) instance.
Note that PII has been masked in the reports stored in the TRE and, as a consequence, structural changes may have been introduced with respect to the original data.
Features intended for use as inputs to machine learning models should therefore not include, or be derived from, structural information in the reports (e.g. line breaks, sentence length) stored in the storage instance detailed below.
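As a purely illustrative precaution (this helper is not part of the original notebook), report text could have its layout collapsed before any features are computed, so that masked line breaks and spacing cannot leak into the feature set:

import re

def normalise_report_text(text: str) -> str:
    # Collapse line breaks and repeated whitespace so that downstream
    # features cannot depend on the layout of the report.
    return re.sub(r"\s+", " ", text).strip()

normalise_report_text("Findings:\n\nNo acute abnormality.")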
1.0.1 Authenticate with Azure
Open the link shown in a browser outside of the TRE, enter the code and log in with your user account.

!az login --use-device-code
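Optionally, you can confirm that the CLI login succeeded before continuing. This check is our addition rather than part of the original notebook; it simply requests a storage-scoped token from the CLI credential and fails if you are not logged in:

from azure.identity import AzureCliCredential

# Raises an error if `az login` has not completed successfully.
token = AzureCliCredential().get_token("https://storage.azure.com/.default")
print("Authenticated; token expires at", token.expires_on)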
1.0.2 Set some key variables - the storage account name, input data filesystem name and the directory containing files
= "stpixldflowehrprod"
storage_account_name ="data-lake-storage-pixld-flowehr-prod"
input_data_fs_name="/" data_directory_path
Import dependencies and define functions to query data
# Function definitions inspired by MS docs
# at https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python
import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from azure.identity import DefaultAzureCredential, AzureCliCredential


class StorageClient:
    def __init__(self, storage_account_name):
        self.storage_account_name = storage_account_name
        self.service_client = self.initialize_storage_account_ad()

    def initialize_storage_account_ad(self):
        # Authenticate to the storage account with the Azure CLI credential
        # obtained via `az login` above.
        try:
            credential = AzureCliCredential()
            service_client = DataLakeServiceClient(
                account_url=f"https://{self.storage_account_name}.dfs.core.windows.net",
                credential=credential,
            )
            return service_client
        except Exception as e:
            print(e)
            return

    def download_file_from_directory(self, file_syst, directory, file_name):
        # Download a single file into the local "downloaded_data" folder.
        try:
            file_system_client = self.service_client.get_file_system_client(file_system=file_syst)
            directory_client = file_system_client.get_directory_client(directory)
            if not os.path.exists("downloaded_data"):
                os.makedirs("downloaded_data")
            local_file = open(f"downloaded_data/{file_name}", 'wb')
            file_client = directory_client.get_file_client(file_name)
            download = file_client.download_file()
            downloaded_bytes = download.readall()
            local_file.write(downloaded_bytes)
            local_file.close()
            return
        except Exception as e:
            print(e)
            return

    def list_directory_contents(self, file_syst, directory):
        # Return the names of all paths under the given directory.
        try:
            file_system_client = self.service_client.get_file_system_client(file_system=file_syst)
            paths = file_system_client.get_paths(path=directory)
            path_list = []
            for path in paths:
                path_list.append(path.name)
            return path_list
        except Exception as e:
            print(e)
            return

    def create_directory(self, file_syst, directory):
        try:
            file_system_client = self.service_client.get_file_system_client(file_system=file_syst)
            file_system_client.create_directory(directory)
            return
        except Exception as e:
            print(e)
            return

    def upload_file_to_directory(self, file_syst, directory, uploaded_file_name, file_to_upload):
        # Upload a local file into the given directory of the ADLS filesystem.
        try:
            file_system_client = self.service_client.get_file_system_client(file_system=file_syst)
            directory_client = file_system_client.get_directory_client(directory)
            file_client = directory_client.create_file(uploaded_file_name)
            with open(file_to_upload, 'r') as local_file:
                file_contents = local_file.read()
                file_client.append_data(data=file_contents, offset=0, length=len(file_contents))
                file_client.flush_data(len(file_contents))
            return
        except Exception as e:
            print(e)
            return
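Note that DefaultAzureCredential is imported above but not used. If the Azure CLI credential were unavailable, one possible variation (an assumption on our part, not something the notebook relies on) would be to construct the service client with it instead:

# Hypothetical variation: let azure-identity pick a credential
# (CLI, environment variables, managed identity, ...) automatically.
service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=DefaultAzureCredential(),
)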
1.0.3 Create an instance of our StorageClient object
client = StorageClient(storage_account_name)
1.0.4 List the contents of the specified directory within the ADLS file system
available_files = client.list_directory_contents(input_data_fs_name, data_directory_path)
print(available_files)
1.0.5 Download all files from a directory
"/",1)[-1]) for datafile in available_files] [client.download_file_from_directory(input_data_fs_name, data_directory_path, datafile.rsplit(
1.0.6 Download individual files from a directory
client.download_file_from_directory(input_data_fs_name, data_directory_path, available_files[0].rsplit("/", 1)[-1])
1.1 Reading downloaded files with pandas
import re
import pandas as pd

parquet = []
csv = []
for x in available_files:
    parquet_re = re.match(r"^.*\.parquet", x)
    csv_re = re.match(r"^.*\.csv", x)
    if parquet_re is not None:
        parquet.append(parquet_re.group(0))
    if csv_re is not None:
        csv.append(csv_re.group(0))
1.1.1 Parquet
local_df = pd.read_parquet(f"downloaded_data/{parquet[0].rsplit('/', 1)[-1]}")
local_df.head()
1.1.2 CSV
local_df = pd.read_csv(f"downloaded_data/{csv[0].rsplit('/', 1)[-1]}")
local_df.head()
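The StorageClient above also exposes create_directory and upload_file_to_directory, which this notebook does not otherwise exercise. A minimal sketch of writing a derived file back to the same filesystem might look like the following; the "outputs" directory and summary.csv file name are illustrative assumptions, not paths that are known to exist:

# Save a small derived artifact locally, then push it to an
# (assumed) "outputs" directory in the same ADLS filesystem.
local_df.head(10).to_csv("summary.csv", index=False)

client.create_directory(input_data_fs_name, "outputs")
client.upload_file_to_directory(input_data_fs_name, "outputs", "summary.csv", "summary.csv")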