Skip to content

Load bulk data in Python

This example is available in the Jupyter notebook at: github.com/intrahealth/client-registry-docs/docs/notebooks/load_bulk_data_in_python.ipynb

#!/usr/bin/env python3
from pathlib import Path
from requests_pkcs12 import get, post
import pandas as pd
import numpy as np

import recordlinkage

import fhirclient.models.patient as p
import fhirclient.models.humanname as hn
import fhirclient.models.contactpoint as cp
import fhirclient.models.fhirdate as fd
import fhirclient.models.identifier as ident
from fhirclient import client

import json
import time
import itertools

# suppress warning: "Certificate for localhost has no `subjectAltName`, falling back to check for a `commonName` for now"
import urllib3
# SubjectAltNameWarning only exists in urllib3 1.x (it was removed in 2.x),
# so look it up defensively instead of failing with AttributeError.
_san_warning = getattr(urllib3.exceptions, 'SubjectAltNameWarning', None)
if _san_warning is not None:
    urllib3.disable_warnings(_san_warning)

# versions of the libraries this example depends on
# (the trailing '\n' argument leaves a blank line after each entry)
print(f"Pandas version: {pd.__version__}", '\n')
# use the public version attribute rather than the private _version module
print(f"Python Record Linkage version: {recordlinkage.__version__}", '\n')
print(f"Numpy version: {np.__version__}", '\n')
print(f"FHIR client version: {client.__version__}", '\n')
Pandas version: 1.0.3 

Python Record Linkage version: 0.14 

Numpy version: 1.18.2 

FHIR client version: 3.2.0 

# Location of the local git clone of github.com/intrahealth/client-registry;
# adjust this if the repository was cloned somewhere else.
crhome = Path.home().joinpath('src', 'github.com', 'intrahealth', 'client-registry')

# Certificates shipped with the repository: the client bundle used to
# authenticate and the server certificate used to verify the connection.
clientcert = crhome.joinpath('server', 'sampleclientcertificates', 'openmrs.p12')
servercert = crhome.joinpath('server', 'certificates', 'server_cert.pem')

# Sample data set to load into a DataFrame.
csv_file = crhome.joinpath('tests', 'uganda_data_v21_20201501.csv')
df_a = pd.read_csv(csv_file)
# (optional) index by record id instead of the default integer index:
# df_a = df_a.set_index('rec_id')

# Quick sanity check of what was loaded.
print('Number of records :', len(df_a))
print(df_a.head())
Number of records : 5000
         rec_id sex date_of_birth given_name       surname phone_number  \
0  rec-2762-org   f      19671207     zuwena         acile   712 300633   
1  rec-2009-org   f      19761028     zuwena        lusike   772 614594   
2  rec-3269-org   f      19811002     zuwena      mungugeo   772 162632   
3  rec-1609-org   f      19270719    zuraika   akantambira   772 837692   
4  rec-2802-org   m                   zulfas      nyanchwo   782 855101   

       uganda_nin   art_number  
0  CF21927470OWMT   KMC-819708  
1  CF68167355NUZY   KUB-176148  
2  CF50136842UQFQ   MBA-746695  
3  CF68008770HZML   KMC-270901  
4  CM25736526XWGC   KSG-830566  
# some cleaning
# Trim the padding around every value. The .str accessor requires text
# columns, so this assumes each column was read as strings.
for column in ('rec_id', 'sex', 'given_name', 'surname', 'date_of_birth',
               'phone_number', 'uganda_nin', 'art_number'):
    df_a[column] = df_a[column].str.strip()

# Expand the single-letter codes; any other value (blank, typos) is left as-is.
df_a['sex'] = df_a['sex'].replace({'f': 'female', 'm': 'male'})
print(df_a['sex'].value_counts())

# fhirclient validates and some birthdate fields are empty/improperly formatted
# keep the first run of digits; values without any digit become NaN
df_a['date_of_birth'] = df_a['date_of_birth'].str.extract(r'(\d+)', expand=False)
# force into datetime using the file's YYYYMMDD layout; with errors='coerce'
# anything that does not fit that layout becomes NaT instead of raising
df_a['date_of_birth'] = pd.to_datetime(df_a['date_of_birth'], format='%Y%m%d', errors='coerce')
# now back into str or fhirdate will complain; missing dates become ''
df_a['date_of_birth'] = df_a['date_of_birth'].dt.strftime('%Y-%m-%d').fillna('')

print(df_a.head())
female    3224
           963
male       809
d            1
r            1
q            1
k            1
Name: sex, dtype: int64
         rec_id     sex date_of_birth given_name      surname phone_number  \
0  rec-2762-org  female    1967-12-07     zuwena        acile   712 300633   
1  rec-2009-org  female    1976-10-28     zuwena       lusike   772 614594   
2  rec-3269-org  female    1981-10-02     zuwena     mungugeo   772 162632   
3  rec-1609-org  female    1927-07-19    zuraika  akantambira   772 837692   
4  rec-2802-org    male                   zulfas     nyanchwo   782 855101   

       uganda_nin  art_number  
0  CF21927470OWMT  KMC-819708  
1  CF68167355NUZY  KUB-176148  
2  CF50136842UQFQ  MBA-746695  
3  CF68008770HZML  KMC-270901  
4  CM25736526XWGC  KSG-830566  
# default server/path
server = "https://167.172.131.240:3000/Patient"
# 3 records, modify if more are required
limit = 3
# the same request headers are sent with every record
headers = {'Content-Type': 'application/json'}

# Build one FHIR Patient per row and POST it to the server.
for index, row in itertools.islice(df_a.iterrows(), limit):
# for index, row in df_a.iterrows():
    patient = p.Patient() # not using rec_id as pandas id, leaving empty
    patient.gender = row['sex']

    name = hn.HumanName()
    name.given = [row['given_name']]
    name.family = row['surname']
    name.use = 'official'
    patient.name = [name]

    phone = cp.ContactPoint()
    phone.system = 'phone'
    phone.value = row['phone_number']
    patient.telecom = [phone]

    # An empty string is not a valid FHIR date, so leave birthDate unset
    # for rows whose date of birth is missing.
    birth_date = row['date_of_birth']
    if isinstance(birth_date, str) and birth_date:
        patient.birthDate = fd.FHIRDate(birth_date)

    # identifiers: source record id, ART number and national id
    emr = ident.Identifier()
    emr.system = 'http://clientregistry.org/openmrs'
    emr.value = row['rec_id']
    art = ident.Identifier()
    art.system = 'http://system1/artnumber'
    art.value = row['art_number']
    nin = ident.Identifier()
    nin.system = 'http://system1/nationalid'
    nin.value = row['uganda_nin']
    patient.identifier = [emr, art, nin]
    # print(json.dumps(patient.as_json()))

    # time only the HTTP round trip
    start = time.perf_counter()
    response = post(server, headers=headers, data=json.dumps(patient.as_json()),
                    pkcs12_filename=clientcert, pkcs12_password='', verify=servercert)
    elapsed_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    # A rejected request carries no Location header; report the status
    # instead of raising KeyError.
    location = response.headers.get(
        'location', '<no Location header, HTTP {0}>'.format(response.status_code))
    print(index, location, " | ", round(elapsed_ms, 1), "ms") # response.headers['Date']
    print(response.headers)


Last update: April 13, 2020