Pandas DataFrame Parser
A Pandas DataFrame is a popular data structure in the Python programming language, commonly used for data manipulation and analysis. It provides a comprehensive set of tools for working with structured data, making it a versatile option for tasks such as data cleaning, transformation, and analysis.
This output parser allows users to specify an arbitrary Pandas DataFrame and query LLMs for data in the form of a formatted dictionary that extracts data from the corresponding DataFrame. Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate a well-formed query as per the defined format instructions.
Use Pandas' DataFrame object to declare the DataFrame you wish to perform queries on.
import pprint
from typing import Any, Dict
import pandas as pd
from langchain.llms import OpenAI
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain.prompts import PromptTemplate
# Completion model shared by every query in this walkthrough. The name and
# temperature are kept as separate variables so they are easy to swap out.
model_name = "text-davinci-003"
temperature = 0.5
model = OpenAI(
    model_name=model_name,
    temperature=temperature,
)
# Solely for documentation purposes.
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    """Pretty-print parser output with each value rendered as a plain dict.

    Args:
        parser_output: Mapping of result names to values. Values exposing a
            ``to_dict()`` method (for example pandas Series) are converted
            with it; any other value (for example a scalar aggregate) is
            printed unchanged.

    Returns:
        None. The formatted mapping is written to standard output.
    """
    # Build a fresh mapping instead of overwriting entries in place, so the
    # caller's parser output keeps its original values after printing.
    printable = {
        key: value.to_dict() if hasattr(value, "to_dict") else value
        for key, value in parser_output.items()
    }
    # The narrow width forces one entry per line, which keeps nested
    # dictionaries readable in rendered documentation.
    pprint.PrettyPrinter(width=4, compact=True).pprint(printable)
# Define your desired Pandas DataFrame: one list of values per column,
# with rows labelled by the default 0-based integer index.
column_data = {
    "num_legs": [2, 4, 8, 0],
    "num_wings": [2, 0, 0, 0],
    "num_specimen_seen": [10, 2, 1, 8],
}
df = pd.DataFrame(column_data)
# Set up a parser bound to the DataFrame; its format instructions are
# injected into the prompt template below.
parser = PandasDataFrameOutputParser(dataframe=df)

# Example of a column operation.
df_query = "Retrieve the num_wings column."

# Set up the prompt. The format instructions are supplied as a partial
# variable, so only the query has to be provided when formatting.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Render the prompt, send it to the model, and show the raw completion.
_input = prompt.format_prompt(query=df_query)
output = model(_input.to_string())
print("LLM Output:", output)

# Parse the completion against the DataFrame and pretty-print the result.
parser_output = parser.parse(output)
format_parser_output(parser_output)
LLM Output: column:num_wings
{'num_wings': {0: 2,
1: 0,
2: 0,
3: 0}}
# Example of a row operation.
# NOTE(review): the recorded output for this cell is `row:1`, whose values
# match the DataFrame row labelled 1 (the second row), not the "first row"
# the query asks for. Confirm whether the query wording should be adjusted.
df_query = "Retrieve the first row."

# Set up the prompt. The format instructions are supplied as a partial
# variable, so only the query has to be provided when formatting.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Render the prompt, send it to the model, and show the raw completion.
_input = prompt.format_prompt(query=df_query)
output = model(_input.to_string())
print("LLM Output:", output)

# Parse the completion against the DataFrame and pretty-print the result.
parser_output = parser.parse(output)
format_parser_output(parser_output)
LLM Output: row:1
{'1': {'num_legs': 4,
'num_specimen_seen': 2,
'num_wings': 0}}
# Example of a DataFrame operation (here, a mean) restricted to a range of
# rows.
# NOTE(review): the recorded result 4.0 equals (4 + 8 + 0) / 3, which
# suggests the [1..3] range includes both endpoints. Confirm against the
# parser's format instructions.
df_query = "Retrieve the average of the num_legs column from rows 1 to 3."

# Set up the prompt. The format instructions are supplied as a partial
# variable, so only the query has to be provided when formatting.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Render the prompt, send it to the model, and show the raw completion.
_input = prompt.format_prompt(query=df_query)
output = model(_input.to_string())
print("LLM Output:", output)

# The parsed result is left as the final expression so a notebook displays
# it directly.
parser.parse(output)
LLM Output: mean:num_legs[1..3]
{'mean': 4.0}
# Example of a query that cannot be satisfied: it asks for a column
# (num_fingers) that the DataFrame defined above does not contain.
df_query = "Retrieve the mean of the num_fingers column."

# Set up the prompt. The format instructions are supplied as a partial
# variable, so only the query has to be provided when formatting.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Render the prompt, send it to the model, and show the raw completion.
_input = prompt.format_prompt(query=df_query)
# Expected Output: "Invalid column: num_fingers".
output = model(_input.to_string())
print("LLM Output:", output)

# Expected Output: Will raise an OutputParserException.
parser.parse(output)